From a11097a909e3f9f3cf6916c24f10166f645cbb1f Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 14:39:06 -0500 Subject: [PATCH 01/23] added some null value condition checking on pr files for error: ```bash Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task R = retval = fun(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/tasks.py", line 18, in process_pull_request_files pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pull_request_files_model pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/util/gh_graphql_entities.py", line 344, in __iter__ coreData['totalCount'] ~~~~~~~~^^^^^^^^^^^^^^ TypeError: 'NoneType' object is not subscriptable ``` --- augur/tasks/github/util/gh_graphql_entities.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index cb5df455b7..bba45b0d07 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,11 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - coreData['totalCount'] + if coreData['totalCount']: + self.logger.info(f"pr file core data obtained") + else: + self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for pr file core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error( From 793e1007353bad9e9c735701cdf0e82d919435ba Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 14:49:12 -0500 Subject: [PATCH 02/23] small edit --- augur/tasks/github/util/gh_graphql_entities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index bba45b0d07..5e55df9b16 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -342,9 +342,9 @@ def __iter__(self): #Check to make sure we have data if coreData['totalCount']: - self.logger.info(f"pr file core data obtained") + self.logger.info(f"... core data obtained") else: - self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for pr file core data (value): {data} \n Zero value assigned.") + self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") From 0cc6a5438591f9bbc28c64e1007fa3b527fdb708 Mon Sep 17 00:00:00 2001 From: "Sean P. 
Goggins" Date: Fri, 10 May 2024 17:46:43 -0500 Subject: [PATCH 03/23] collection rate for secondary bump --- augur/application/cli/backend.py | 2 +- augur/application/cli/collection.py | 2 +- augur/application/cli/tasks.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index a0480adab4..d7a8ad745d 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -172,7 +172,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 63c433a79e..7d65cad978 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -132,7 +132,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index b4bec994eb..7b74d208e0 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -37,7 +37,7 @@ def start(): scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=90 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) From 4a18742eda9fe953fd8d93c94a483e424c4b494d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 18:11:54 -0500 Subject: [PATCH 04/23] typo fix --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 5e55df9b16..c5714ce60f 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -344,7 +344,7 @@ def __iter__(self): if coreData['totalCount']: self.logger.info(f"... 
core data obtained") else: - self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") From b230d73e4052433ce51e5c8050bc6e0534e86c6d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 18:34:53 -0500 Subject: [PATCH 05/23] `is not None` instead. --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index c5714ce60f..0e8939cf6c 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - if coreData['totalCount']: + if coreData['totalCount'] is not None: self.logger.info(f"... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") From f296a07ad0fd83258fc5b183dc3bba97a9502f71 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:07:08 -0500 Subject: [PATCH 06/23] updated is not None logic: ```python if coreData is not None: self.logger.info(f"... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) self.logger.info(f"Graphql paramters: {params}") return ``` --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 0e8939cf6c..ca5dec2f6e 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - if coreData['totalCount'] is not None: + if coreData is not None: self.logger.info(f"... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") From 8663655ea1355dcaf24ed8586697078d633c9fbf Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:12:00 -0500 Subject: [PATCH 07/23] better update: ```python try: if coreData is not None: if coreData.get('totalCount') is not None: self.logger.info("... 
core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 else: self.logger.error("Core data is None, cannot proceed with operations on it.") except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) ``` --- .../tasks/github/util/gh_graphql_entities.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index ca5dec2f6e..503f440a3f 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,18 +341,18 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - if coreData is not None: - self.logger.info(f"... core data obtained") - else: - self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") - coreData['totalCount'] = 0 + try: + if coreData is not None: + if coreData.get('totalCount') is not None: + self.logger.info("... core data obtained") + else: + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 + else: + self.logger.error("Core data is None, cannot proceed with operations on it.") except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") - self.logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) - - self.logger.info(f"Graphql paramters: {params}") - return + self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) if int(coreData['totalCount']) == 0: From ca6cb7958916a095404d2d360193e237acbfaaec Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:12:48 -0500 Subject: [PATCH 08/23] woops. one error. --- augur/tasks/github/util/gh_graphql_entities.py | 1 - 1 file changed, 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 503f440a3f..a3bf02ff14 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,6 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - try: if coreData is not None: if coreData.get('totalCount') is not None: self.logger.info("... core data obtained") From 0de7fdd2ce96a2d5e0854ab37eb2d8d694267b75 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:15:12 -0500 Subject: [PATCH 09/23] one more. 
--- augur/tasks/github/util/gh_graphql_entities.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index a3bf02ff14..1ad7ba5ee1 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -348,7 +348,9 @@ def __iter__(self): self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 else: - self.logger.error("Core data is None, cannot proceed with operations on it.") + self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") + coreData['totalCount'] = 0 + except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) From 85bd70f83e778c49e9881dc7312c0dc5c26b46db Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:16:43 -0500 Subject: [PATCH 10/23] another tweak. --- augur/tasks/github/util/gh_graphql_entities.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 1ad7ba5ee1..9a4b041f4d 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -346,10 +346,12 @@ def __iter__(self): self.logger.info("... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") - coreData['totalCount'] = 0 + yield None + return else: self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") - coreData['totalCount'] = 0 + yield None + return except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") From e9b83c6550b3add3c2248f3ea501c02028f03001 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:17:57 -0500 Subject: [PATCH 11/23] and another --- augur/tasks/github/util/gh_graphql_entities.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 9a4b041f4d..c3ace4ee0c 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -346,8 +346,7 @@ def __iter__(self): self.logger.info("... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") - yield None - return + coreData['totalCount'] = 0 else: self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") yield None From b244c608da025421a98d7be06f902fb092ae6791 Mon Sep 17 00:00:00 2001 From: "Sean P. 
Goggins" Date: Fri, 10 May 2024 22:14:26 -0500 Subject: [PATCH 12/23] another tweak, as the error just moved up: ```python coreData = self.extract_paginate_result(data) ``` ```python Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task R = retval = fun(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/tasks.py", line 18, in process_pull_request_files pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pull_request_files_model pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/util/gh_graphql_entities.py", line 341, in __iter__ coreData = self.extract_paginate_result(data) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/github/util/gh_graphql_entities.py", line 253, in extract_paginate_result raise TimeoutError("No data received from endpoint.") TimeoutError: No data received from endpoint. ``` --- augur/tasks/github/util/gh_graphql_entities.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index c3ace4ee0c..574adbbaf0 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -338,9 +338,8 @@ def __iter__(self): #self.logger.info(f"{params}") data = self.request_graphql_dict(variables=params) try: - coreData = self.extract_paginate_result(data) - #Check to make sure we have data + coreData = self.extract_paginate_result(data) if coreData is not None: if coreData.get('totalCount') is not None: self.logger.info("... core data obtained") @@ -351,7 +350,6 @@ def __iter__(self): self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") yield None return - except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) From 478048d4f55bb8fc959d8ef036210fdae96b6c3d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Sat, 11 May 2024 10:31:01 -0500 Subject: [PATCH 13/23] Issue with extremely large repository pull_requests fixed by adding incremental inserts. Error Details: - OS Level OOM Error Log - Augur error log - Database state showing that the killed collection process leaves all subsequent core tasks hanging after the OOM. 
```bash May 11 08:40:26 ip-172-31-43-26 kernel: [196984.540841] [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name May 11 08:40:26 ip-172-31-43-26 kernel: [196984.543107] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=user.slice,mems_allowed=0,global_oom,task_memcg=/user.slice/user-1000.slice/session-330.scope,task=celery,pid=2787413,uid=1000 May 11 08:40:26 ip-172-31-43-26 kernel: [196984.543138] Out of memory: Killed process 2787413 (celery) total-vm:14657984kB, anon-rss:8728064kB, file-rss:5632kB, shmem-rss:0kB, UID:1000 pgtables:17844kB oom_score_adj:0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.229829] Softwar~cThread invoked oom-killer: gfp_mask=0x140cca(GFP_HIGHUSER_MOVABLE|__GFP_COMP), order=0, oom_score_adj=0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.229874] oom_kill_process+0x10c/0x1b0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.229884] __alloc_pages_may_oom+0x114/0x1e0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.230235] [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name May 11 11:33:17 ip-172-31-43-26 kernel: [207355.232557] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=user.slice,mems_allowed=0,global_oom,task_memcg=/user.slice/user-1000.slice/session-330.scope,task=celery,pid=2787363,uid=1000 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.232580] Out of memory: Killed process 2787363 (celery) total-vm:14415148kB, anon-rss:8500952kB, file-rss:9216kB, shmem-rss:0kB, UID:1000 pgtables:17364kB oom_score_adj:0 augur.tasks.github.pull_requests.tasks.collect_pull_requests cf2337d6-6adb-4b0b-a47d-cfd74c01d86a Name augur.tasks.github.pull_requests.tasks.collect_pull_requests UUID cf2337d6-6adb-4b0b-a47d-cfd74c01d86a State FAILURE args ('https://github.com/kubernetes/kubernetes',) kwargs {} Result None Received 2024-05-11 02:19:36.542103 UTC Started 2024-05-11 02:19:36.543418 UTC Failed 2024-05-11 05:03:50.075814 UTC Retries 0 Worker core:c1d207996f614c648319b1c800672fce@ip-172-31-43-26 Exception WorkerLostError('Worker exited prematurely: signal 9 (SIGKILL) Job: 35.') Timestamp 2024-05-11 05:03:50.075814 UTC Traceback Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/billiard/pool.py", line 1265, in mark_as_worker_lost raise WorkerLostError( billiard.exceptions.WorkerLostError: Worker exited prematurely: signal 9 (SIGKILL) Job: 35. 
Clock 56697 Root Root id f93b7451-cfb2-42fc-a7b8-4ea8fe59d647 Parent Parent id 41327ca8-c735-44c5-a536-38acd8968e42 Children Augur thinks we are still collecting, so it will never get to messages: repo_id core_data_last_collected core_status core_task_id secondary_data_last_collected secondary_status secondary_task_id event_last_collected facade_status facade_data_last_collected facade_task_id core_weight facade_weight secondary_weight issue_pr_sum commit_sum ml_status ml_data_last_collected ml_task_id ml_weight 123948 2024-02-09 04:43:29 Collecting 9bef29e6-9519-4acb-8469-24c6857c8d92 2022-12-20 00:00:00 Success Success 2024-04-10 06:07:05 -1249186522 121948 -52204734437 203819 121948 Pending -12149126357``` --- augur/tasks/github/pull_requests/tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 69e40f6818..04c7ef7a0d 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -68,6 +68,15 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") all_data += page_data + + if len(all_data) >= 200: + with GithubTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + repo_id = augur_db.session.query(Repo).filter( + Repo.repo_git == repo_git).one().repo_id + owner, repo = get_owner_repo(repo_git) + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + all_data.clear() return all_data From e1f99501f62a55d9f6b2c7d73218f4baefc8a050 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Sat, 11 May 2024 11:14:05 -0500 Subject: [PATCH 14/23] Added error logging to dependency task for this error, which I think arises from not having any files in our annointed programming languages, which are all the most common ones. 
```bash Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task R = retval = fun(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/git/dependency_tasks/tasks.py", line 47, in process_ossf_dependency_metrics generate_scorecard(session, repo.repo_id, repo_git) File "/home/ubuntu/github/augur/augur/tasks/git/dependency_tasks/core.py", line 75, in generate_scorecard required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/util/worker_util.py", line 141, in parse_json_from_subprocess_call raise e File "/home/ubuntu/github/augur/augur/tasks/util/worker_util.py", line 138, in parse_json_from_subprocess_call required_output = json.loads(output) ^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/json/__init__.py", line 346, in loads return _default_decoder.decode(s) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/json/decoder.py", line 337, in decode obj, end = self.raw_decode(s, idx=_w(s, 0).end()) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/json/decoder.py", line 355, in raw_decode raise JSONDecodeError("Expecting value", s, err.value) from None json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) ``` --- augur/tasks/git/dependency_tasks/core.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 296e69075e..e4c6273479 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -72,7 +72,11 @@ def generate_scorecard(session,repo_id,path): key_handler = GithubApiKeyHandler(session, session.logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + try: + required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + except Exception as e: + session.logger.error(f"Could not parse required output! Error: {e}") + raise e session.logger.info('adding to database...') session.logger.debug(f"output: {required_output}") From f50d1240314e854da791f7f8b7fe70f649f10e5e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Mon, 13 May 2024 17:48:39 -0500 Subject: [PATCH 15/23] changing frequency Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 04c7ef7a0d..36a7aab1b3 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -69,7 +69,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: all_data += page_data - if len(all_data) >= 200: + if len(all_data) >= 2000: with GithubTaskManifest(logger) as manifest: augur_db = manifest.augur_db repo_id = augur_db.session.query(Repo).filter( From f55ed92bd91d4c4d999219f071793c49cc997d11 Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Mon, 13 May 2024 18:00:48 -0500 Subject: [PATCH 16/23] Make retrieve_all_pr_data a generator --- augur/tasks/github/pull_requests/tasks.py | 36 ++++++++++------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 36a7aab1b3..d2bbe55c90 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -12,6 +12,8 @@ from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from typing import Generator, List, Dict + platform_id = 1 @@ -29,12 +31,18 @@ def collect_pull_requests(repo_git: str) -> int: Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) - pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) - if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count = 0 + all_data = [] + for page in retrieve_all_pr_data(repo_git, logger, manifest.key_auth): + all_data += page + + if len(all_data) >= 1000: + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + all_data.clear() - return len(pr_data) + return total_count else: logger.info(f"{owner}/{repo} has no pull requests") return 0 @@ -42,7 +50,7 @@ def collect_pull_requests(repo_git: str) -> int: # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: +def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) @@ -52,33 +60,21 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: # returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs) prs = GithubPaginator(url, key_auth, logger) - all_data = [] num_pages = prs.get_num_pages() for page_data, page in prs.iter_pages(): if page_data is None: - return all_data + return if len(page_data) == 0: logger.debug( f"{owner}/{repo} Prs Page {page} contains no data...returning") logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - return all_data + return logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - - all_data += page_data - if len(all_data) >= 2000: - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id - owner, repo = get_owner_repo(repo_git) - process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) - all_data.clear() - - return all_data + yield page_data def 
process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): From d4e276202600432b04870ce3020b5d6ed83c5a47 Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Mon, 13 May 2024 18:10:27 -0500 Subject: [PATCH 17/23] Add un-pushed changes --- augur/tasks/github/pull_requests/tasks.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index d2bbe55c90..f739105f49 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -42,11 +42,17 @@ def collect_pull_requests(repo_git: str) -> int: total_count += len(all_data) all_data.clear() + if len(all_data): + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + + if total_count > 0: return total_count else: logger.info(f"{owner}/{repo} has no pull requests") return 0 + # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table From 6e98367c33b72671a638c022bcbaacfa4e1b0be7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 13 May 2024 19:52:35 -0500 Subject: [PATCH 18/23] Simplify collection Signed-off-by: Andrew Brain --- augur/tasks/util/collection_util.py | 162 +++++++++------------------- 1 file changed, 50 insertions(+), 112 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 3561b19b40..b8e4cda2fd 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -132,58 +132,73 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab def get_active_repo_count(self,session): return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - #Get repo urls based on passed in info. + def get_valid_repos(self,session): - #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook - #Get the count of repos that are currently running this collection hook - #status_column = f"{hook}_status" + active_repo_count = self.get_active_repo_count(session) + limit = self.max_repo-active_repo_count - #Will always disallow errored repos and repos that are already collecting + if limit <= 0: + return - #The maximum amount of repos to schedule is affected by the existing repos running tasks - limit = self.max_repo-active_repo_count + collection_list = get_newly_added_repos(session, limit, hook=self.name) + self.repo_list.extend(collection_list) + limit -= len(collection_list) - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + #Now start recollecting other repos if there is space to do so. 
+ if limit <= 0: + return - session.logger.info(f"User_list: {split_user_list}") + collection_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) - #Iterate through each fourth of the users fetched - for quarter_list in split_user_list: - if limit <= 0: - return + self.repo_list.extend(collection_list) - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) - self.repo_list.extend(collection_list) - #Update limit with amount of repos started - limit -= len(collection_list) +def get_newly_added_repos(session, limit, hook): - #Now start old repos if there is space to do so. - if limit <= 0: - return + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + + + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, augur_data.repo y + where x.repo_id=y.repo_id + and {condition_string} + order by repo_added + limit :limit_num + """).bindparams(limit_num=limit) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] - user_list = get_list_of_all_users(session) - random.shuffle(user_list) + return valid_repo_git_list - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) +def get_repos_for_recollection(session, limit, hook, days_until_collect_again): - for quarter_list in split_user_list: + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.SUCCESS.value)}'""" - #Break out if limit has been reached - if limit <= 0: - return + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.SUCCESS.value)}'""" - #only start repos older than the specified amount of days - #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored - #Order by the relevant weight for the collection hook - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, repo y + where x.repo_id = y.repo_id + and {condition_string} + and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + order by {hook}_data_last_collected + limit :limit_num + """).bindparams(limit_num=limit) - self.repo_list.extend(collection_list) - limit -= len(collection_list) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] + + return valid_repo_git_list def get_enabled_phase_names_from_config(): @@ -610,80 +625,3 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id, col_hook.name - -#def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): -# -# logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") -# if len(repo_git_identifiers) == 0: -# return 0 -# -# logger.info(f"Collection starting for {hook}: 
{tuple(repo_git_identifiers)}") -# -# routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) -# -# routine.start_data_collection() -# -# return len(repo_git_identifiers) - -def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): - - condition_string = "1" - - if hook == "core": - condition_string = get_required_conditions_for_core_repos(allow_collected_before=allow_old_repos,days_until_collect_again= days_to_wait_until_next_collection) - elif hook == "secondary": - condition_string = get_required_conditions_for_secondary_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "facade": - condition_string = get_required_conditions_for_facade_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "ml": - condition_string = get_required_conditions_for_ml_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - - #Query a set of valid repositories sorted by weight, also making sure that the repos are new - #Order by the relevant weight for the collection hook - repo_query = s.sql.text(f""" - SELECT DISTINCT repo.repo_id, repo.repo_git, collection_status.{hook}_weight - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE user_id IN :list_of_user_ids AND {condition_string} - ORDER BY augur_operations.collection_status.{hook}_weight - LIMIT :limit_num - """).bindparams(list_of_user_ids=users,limit_num=limit) - - #Get a list of valid repo ids, limit set to 2 times the usual - valid_repos = session.execute_sql(repo_query).fetchall() - valid_repo_git_list = [repo[1] for repo in valid_repos] - - session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") - - #start repos for new primary collection hook - #collection_size = start_block_of_repos( - # session.logger, session, - # valid_repo_git_list, - # phases, repos_type=repos_type, hook=hook - #) - - return valid_repo_git_list - -def split_random_users_list(session,status_col, status_new): - #Split all users that have new repos into four lists and randomize order - query = s.sql.text(f""" - SELECT - user_id - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE {status_col}='{str(status_new)}' - GROUP BY user_id - """) - - user_list = session.execute_sql(query).fetchall() - random.shuffle(user_list) - - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - - return split_user_list - From 8f596cc614615225ab4d20251a0c1ece3bcc3311 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 13 May 2024 20:00:12 -0500 Subject: [PATCH 19/23] Fix annotation Signed-off-by: Andrew Brain --- 
augur/tasks/github/pull_requests/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index f739105f49..73ea1b025a 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -56,7 +56,7 @@ def collect_pull_requests(repo_git: str) -> int: # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> Generator[List[Dict]]: +def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) From f6d5680ddb42166f5dad8b810126bea6e9630df6 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 14 May 2024 11:40:26 -0500 Subject: [PATCH 20/23] small dial back of concurrency which didn't make sense. --- augur/application/cli/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index 7b74d208e0..c64dce5b88 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -37,7 +37,7 @@ def start(): scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=90 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) From 03fc1fe6df59ae94495cc7c65521a5066bf25d6e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 15 May 2024 11:52:20 -0500 Subject: [PATCH 21/23] updated versions Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- docker/backend/Dockerfile | 2 +- docker/database/Dockerfile | 2 +- docker/rabbitmq/Dockerfile | 2 +- metadata.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3f83946f5d..02ec125fb6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.63.3 +# Augur NEW Release v0.70.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.63.3 +Augur is now releasing a dramatically improved new version to the main branch. 
It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 1ec7871c9b..6e158d199b 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-bookworm LABEL maintainer="outdoors@acm.org" -LABEL version="0.63.3" +LABEL version="0.70.0" ENV DEBIAN_FRONTEND=noninteractive diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index effe34d3b2..1421e1f76c 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.63.3" +LABEL version="0.70.0" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 80542c788b..079c73dc99 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 +1,7 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.63.3" +LABEL version="0.70.0" ARG RABBIT_MQ_DEFAULT_USER ARG RABBIT_MQ_DEFAULT_PASSWORD diff --git a/metadata.py b/metadata.py index 841521b515..b914869d58 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.63.3" -__release__ = "v0.63.3 (Supply Chain Gang)" +__version__ = "0.70.0" +__release__ = "v0.70.0 (Windows 95 Man!)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" From 2241f3383d58688028fd2478bc6ac6d501a192a7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Thu, 16 May 2024 21:54:02 -0500 Subject: [PATCH 22/23] Fix issue where secondary tries to collect before core is collected Signed-off-by: Andrew Brain --- augur/tasks/util/collection_util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index b8e4cda2fd..9776258626 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -156,12 +156,15 @@ def get_valid_repos(self,session): def get_newly_added_repos(session, limit, hook): + condition_string = "" if hook in ["core", "secondary", "ml"]: - condition_string = f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + condition_string += f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" elif hook == "facade": - condition_string = f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + condition_string += f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + if hook == "secondary": + condition_string += f""" and core_status='{str(CollectionState.SUCCESS.value)}'""" repo_query = s.sql.text(f""" select repo_git From 16f6d6b3e4d96514542a39b4be3f70b55f20d4a7 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 21 May 2024 13:33:21 +0300 Subject: [PATCH 23/23] commented out the rebuilding of the dm_ tables. This should be rebuilt using a materialized view. Signed-off-by: Sean P. 
Goggins --- .../facade_worker/rebuildcache.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 5668739767..e4697dbc19 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -396,7 +396,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_weekly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_weekly) + +# session.execute_sql(clear_dm_repo_group_weekly) clear_dm_repo_group_monthly = s.sql.text(""" DELETE @@ -410,7 +411,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_monthly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_monthly) + +# session.execute_sql(clear_dm_repo_group_monthly) clear_dm_repo_group_annual = s.sql.text(""" DELETE @@ -424,7 +426,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_annual c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_annual) +# session.execute_sql(clear_dm_repo_group_annual) clear_dm_repo_weekly = s.sql.text(""" DELETE @@ -441,7 +443,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_weekly) +# session.execute_sql(clear_dm_repo_weekly) clear_dm_repo_monthly = s.sql.text(""" DELETE @@ -458,7 +460,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_monthly) +# session.execute_sql(clear_dm_repo_monthly) clear_dm_repo_annual = s.sql.text(""" DELETE @@ -475,7 +477,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_annual) +# session.execute_sql(clear_dm_repo_annual) clear_unknown_cache = s.sql.text(""" DELETE @@ -573,7 +575,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "r.repo_group_id, info.a, info.b, info.c") ).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_projects_by_week) +# session.execute_sql(cache_projects_by_week) cache_projects_by_month = s.sql.text( ("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -609,7 +611,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "r.repo_group_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_projects_by_month) +# session.execute_sql(cache_projects_by_month) cache_projects_by_year = s.sql.text(( "INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, 
removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -649,7 +651,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): - session.execute_sql(cache_projects_by_year) + # session.execute_sql(cache_projects_by_year) # Start caching by repo session.log_activity('Verbose','Caching repos') @@ -689,7 +691,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_week) +# session.execute_sql(cache_repos_by_week) cache_repos_by_month = s.sql.text(( "INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -725,7 +727,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_month) +# session.execute_sql(cache_repos_by_month) cache_repos_by_year = s.sql.text(( "INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -759,7 +761,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_year) +# session.execute_sql(cache_repos_by_year) # Reset cache flags
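
Taken together, patches 13 and 15-17 replace the collect-everything-then-insert approach with a streaming one: `retrieve_all_pr_data` becomes a generator that yields one page of pull requests at a time, and `collect_pull_requests` flushes the accumulated pages to the database once a batch threshold is reached, so memory stays bounded even for repositories the size of kubernetes/kubernetes. A minimal, self-contained sketch of that pattern follows; `fetch_pages` and `save_batch` are hypothetical stand-ins for `GithubPaginator` and `process_pull_requests`, the unused `repo_git` parameter is kept only to mirror the real signature, and the fake page data exists solely to make the example runnable.

```python
from typing import Dict, Generator, List

# Hypothetical stand-in data so the sketch runs on its own:
# 25 "pages" of 100 pull-request records each.
FAKE_PAGES = [[{"number": i} for i in range(n, n + 100)] for n in range(0, 2500, 100)]

def fetch_pages(repo_git: str) -> Generator[List[Dict], None, None]:
    """Yield one page of PR records at a time instead of returning one big list."""
    for page_data in FAKE_PAGES:
        if not page_data:
            return
        yield page_data

def save_batch(batch: List[Dict]) -> None:
    """Pretend to bulk-insert a batch (process_pull_requests in the real code)."""
    print(f"inserting {len(batch)} PRs")

def collect_pull_requests(repo_git: str, batch_size: int = 1000) -> int:
    total = 0
    batch: List[Dict] = []
    for page_data in fetch_pages(repo_git):
        batch.extend(page_data)
        # Flush once the batch is large enough, so the worker never holds the
        # full PR history of a huge repository in memory at once (the OOM
        # described in patch 13).
        if len(batch) >= batch_size:
            save_batch(batch)
            total += len(batch)
            batch.clear()
    # Flush whatever remains after the last page (the leftover handling added
    # in patch 17).
    if batch:
        save_batch(batch)
        total += len(batch)
    return total

print(collect_pull_requests("https://github.com/example/repo"))
```

The same bounded-memory idea is what the 2000-record threshold in patch 15 approximated inside the old list-building function; moving the flush into the caller and yielding pages keeps the fetching code free of database concerns.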