Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

feat: Improves guideline extraction pipeline #41

Merged
merged 18 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 52 additions & 13 deletions src/app/api/api_v1/endpoints/repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
from base64 import b64decode
from datetime import datetime
from functools import partial
from typing import List, cast

from fastapi import APIRouter, Depends, HTTPException, Path, Security, status
Expand All @@ -20,6 +21,7 @@
from app.services.openai import openai_client
from app.services.slack import slack_client
from app.services.telemetry import telemetry_client
from app.services.utils import execute_in_parallel

logger = logging.getLogger("uvicorn.error")
router = APIRouter()
Expand Down Expand Up @@ -177,26 +179,63 @@
telemetry_client.capture(user.id, event="repo-parse-guidelines", properties={"repo_id": repo_id})
# Sanity check
repo = cast(Repository, await repos.get(repo_id, strict=True))
# STATIC CONTENT
# Stage all the text sources
sources = []

Check warning on line 183 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L183

Added line #L183 was not covered by tests
# Parse CONTRIBUTING (README if CONTRIBUTING doesn't exist)
contributing = gh_client.get_file(repo.full_name, "CONTRIBUTING.md", payload.github_token)
# readme = gh_client.get_readme(payload.github_token)
# diff_hunk, body, path
# comments = gh_client.list_review_comments(payload.github_token)
readme = gh_client.get_readme(repo.full_name, payload.github_token) if contributing is None else None
if contributing is not None:
sources.append((contributing["path"], b64decode(contributing["content"]).decode()))
if readme is not None:
sources.append((readme["path"], b64decode(readme["content"]).decode()))

Check warning on line 190 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L186-L190

Added lines #L186 - L190 were not covered by tests
# Pull request comments (!= review comments/threads)
pull_comments = [

Check warning on line 192 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L192

Added line #L192 was not covered by tests
pull
for pull in gh_client.fetch_pull_comments_from_repo(repo.full_name, token=payload.github_token)
if len(pull["comments"]) > 0
]
if len(pull_comments) > 0:

Check warning on line 197 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L197

Added line #L197 was not covered by tests
# Keep: body, user/id, reactions/total_count
corpus = "# Pull request comments\n\n\n\n\n\n".join([

Check warning on line 199 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L199

Added line #L199 was not covered by tests
f"PULL REQUEST {pull['pull']['number']} from user {pull['pull']['user_id']}\n\n"
+ "\n\n".join(f"[User {comment['user_id']}] {comment['body']}" for comment in pull["comments"])
for pull in pull_comments
])
sources.append(("pull_request_comments", corpus))

Check warning on line 204 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L204

Added line #L204 was not covered by tests
# Review threads
review_comments = [

Check warning on line 206 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L206

Added line #L206 was not covered by tests
pull
for pull in gh_client.fetch_reviews_from_repo(repo.full_name, token=payload.github_token)
if len(pull["threads"]) > 0
]
# Ideas: filter on pulls with highest amount of comments recently, add the review output rejection/etc
if len(review_comments) > 0:

Check warning on line 212 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L212

Added line #L212 was not covered by tests
# Keep: code, body, user/id, reactions/total_count
corpus = "# Code review history\n\n\n\n\n\n".join([

Check warning on line 214 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L214

Added line #L214 was not covered by tests
f"PULL: {pull['pull']['number']} from user {pull['pull']['user_id']}\n\n"
+ "\n\n".join(
f"[Code diff]\n```{thread[0]['code']}\n```\n"
+ "\n".join(f"[User {comment['user_id']}] {comment['body']}" for comment in thread)
for thread in pull["threads"]
)
for pull in review_comments
])
sources.append(("review_comments", corpus))

Check warning on line 223 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L223

Added line #L223 was not covered by tests
# If not enough information, raise error
if contributing is None:
if len(sources) == 0:

Check warning on line 225 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L225

Added line #L225 was not covered by tests
raise HTTPException(status.HTTP_404_NOT_FOUND, detail="No useful information is accessible in the repository")
# Analyze with LLM
contributing_guidelines = openai_client.parse_guidelines_from_text(
b64decode(contributing["content"]).decode(),
user_id=str(user.id),
# Process all sources in parallel
responses = execute_in_parallel(

Check warning on line 228 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L228

Added line #L228 was not covered by tests
partial(openai_client.parse_guidelines_from_text, user_id=str(user.id)),
(corpus for _, corpus in sources),
num_threads=len(sources),
)
# contributing_guidelines = ollama_client.parse_guidelines_from_text(b64decode(contributing["content"]).decode())
return [
ParsedGuideline(**guideline.dict(), repo_id=repo_id, origin_path=contributing["path"])
for guideline in contributing_guidelines
guidelines = [

Check warning on line 233 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L233

Added line #L233 was not covered by tests
ParsedGuideline(**guideline.dict(), repo_id=repo_id, source=source)
for (source, _), response in zip(sources, responses)
for guideline in response
]
return guidelines

Check warning on line 238 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L238

Added line #L238 was not covered by tests


@router.post("/{repo_id}/waitlist", status_code=status.HTTP_200_OK, summary="Add a GitHub repository to the waitlist")
Expand Down
2 changes: 1 addition & 1 deletion src/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def sqlachmey_uri(cls, v: str) -> str:
JWT_ENCODING_ALGORITHM: str = "HS256"
# Compute
OPENAI_API_KEY: str = os.environ["OPENAI_API_KEY"]
OPENAI_MODEL: OpenAIModel = OpenAIModel.GPT4_TURBO
OPENAI_MODEL: OpenAIModel = OpenAIModel.GPT3_5_TURBO

# Error monitoring
SENTRY_DSN: Union[str, None] = os.environ.get("SENTRY_DSN")
Expand Down
4 changes: 2 additions & 2 deletions src/app/schemas/guidelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ class GuidelineExample(BaseModel):


class GuidelineContent(BaseModel):
title: str = Field(..., min_length=6, max_length=100)
title: str = Field(..., min_length=3, max_length=100)
details: str = Field(..., min_length=6, max_length=1000)


class ParsedGuideline(GuidelineContent):
repo_id: int = Field(..., gt=0)
origin_path: str
source: str


class GuidelineLocation(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion src/app/schemas/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class ChatCompletion(BaseModel):
frequency_penalty: float = 1.0
response_format: _ResponseFormat = _ResponseFormat(type="json_object")
user: Union[str, None] = None
# seed: int = 42
seed: int = 42


class GHTokenRequest(BaseModel):
Expand Down
180 changes: 169 additions & 11 deletions src/app/services/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0> for full license details.

import logging
from typing import Any, Dict, List, Union
from functools import partial
from operator import itemgetter
from typing import Any, Dict, List, Union, cast

import requests
from fastapi import HTTPException, status
Expand All @@ -13,12 +15,19 @@
from app.core.config import settings
from app.models import User, UserScope
from app.schemas.services import GHToken, GHTokenRequest
from app.services.utils import execute_in_parallel

logger = logging.getLogger("uvicorn.error")

__all__ = ["gh_client"]


def resolve_diff_section(diff_hunk: str, first_line: int, last_line: int) -> str:
    """Extract the trailing section of a diff hunk covering the requested line span.

    Assumes the final line of ``diff_hunk`` corresponds to ``last_line``, so the
    section is simply the last ``last_line - first_line + 1`` lines of the hunk.
    """
    span = last_line - first_line + 1
    hunk_lines = diff_hunk.split("\n")
    return "\n".join(hunk_lines[-span:])

Check warning on line 28 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L27-L28

Added lines #L27 - L28 were not covered by tests


class GitHubClient:
ENDPOINT: str = "https://api.github.com"
OAUTH_ENDPOINT: str = "https://github.com/login/oauth/access_token"
Expand Down Expand Up @@ -131,23 +140,172 @@
f"repos/{repo_name}/pulls",
token,
state="closed",
sort="popularity",
sort="created",
direction="desc",
base=self._get(f"repos/{repo_name}", token).json()["default_branch"],
per_page=per_page,
).json()

def list_review_comments(self, repo_name: str, token: Union[str, None] = None) -> List[Dict[str, Any]]:
# https://docs.github.com/en/rest/pulls/comments#list-review-comments-in-a-repository
comments = self._get(
f"repos/{repo_name}/pulls/comments",
def list_comments_from_issue(
    self, issue_number: int, repo_name: str, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Retrieve the human-authored comments of a given issue.

    See https://docs.github.com/en/rest/issues/comments#list-issue-comments
    """
    response = self._get(f"repos/{repo_name}/issues/{issue_number}/comments", token, **kwargs)
    # Keep only comments posted by real user accounts (drops bots & apps)
    return [comment for comment in response.json() if comment["user"]["type"] == "User"]

def list_reviews_from_pull(
    self, repo_name: str, pull_number: int, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Fetch the reviews of a pull request, most recent first.

    See https://docs.github.com/en/rest/pulls/reviews#list-reviews-for-a-pull-request
    """
    endpoint = f"repos/{repo_name}/pulls/{pull_number}/reviews"
    response = self._get(
        endpoint,
        token,
        sort="created_at",
        direction="desc",
        per_page=100,
        **kwargs,
    )
    return response.json()

def list_threads_from_review(
    self, repo_name: str, pull_number: int, review_id: int, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Fetch the comments attached to a specific pull-request review.

    See https://docs.github.com/en/rest/pulls/reviews#list-comments-for-a-pull-request-review
    """
    endpoint = f"repos/{repo_name}/pulls/{pull_number}/reviews/{review_id}/comments"
    return self._get(endpoint, token, **kwargs).json()
# Get comments (filter account type == user, & user != author) --> take diff_hunk, body, path
return [comment for comment in comments if comment["user"]["type"] == "User"]

def list_review_comments_from_pull(
    self, pull_number: int, repo_name: str, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Fetch the review comments of a pull request, authored by human users.

    See https://docs.github.com/en/rest/pulls/comments#list-review-comments-on-a-pull-request

    Args:
        pull_number: number of the pull request.
        repo_name: full name of the repository (e.g. "owner/repo").
        token: optional GitHub token used for authentication.
        kwargs: extra query parameters forwarded to the API call.

    Returns:
        A flat list of review comments posted by accounts of type "User"
        (bots and apps are filtered out).
    """
    # NOTE: the return annotation was previously List[List[Dict[str, Any]]],
    # but a flat list of comments is what is actually returned.
    comments = self._get(
        f"repos/{repo_name}/pulls/{pull_number}/comments",
        token,
        sort="created_at",
        **kwargs,
    ).json()
    return [comment for comment in comments if comment["user"]["type"] == "User"]

def fetch_reviews_from_repo(
    self, repo_name: str, num_pulls: int = 30, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Collect the review threads of the latest pull requests of a repository.

    Args:
        repo_name: full name of the repository (e.g. "owner/repo").
        num_pulls: maximum number of pull requests to inspect.
        token: optional GitHub token used for authentication.
        kwargs: extra query parameters forwarded to the comment listing calls.

    Returns:
        One entry per pull request, with its metadata under "pull" and its review
        comments grouped into conversation threads under "threads".
    """
    # Fetch the most recent closed pulls
    pulls = self.list_pulls(repo_name, token, per_page=num_pulls)
    # Fetch the review comments of each pull in parallel
    comments = cast(
        List[List[Dict[str, Any]]],
        execute_in_parallel(
            partial(self.list_review_comments_from_pull, repo_name=repo_name, token=token, per_page=100, **kwargs),
            (pull["number"] for pull in pulls),
            len(pulls),
        ),
    )
    # Index every comment by ID so thread ID chains can be resolved back to full records
    id_map = {
        comment["id"]: {
            "id": comment["id"],
            # Trim the diff hunk down to the lines the comment actually targets
            "code": resolve_diff_section(
                comment["diff_hunk"],
                # original_start_line is null for single-line comments
                comment["original_start_line"] or comment["original_line"],
                comment["original_line"],
            ),
            "body": comment["body"],
            "path": comment["path"],
            "user_id": comment["user"]["id"],
            "reactions_total_count": comment["reactions"]["total_count"],
            "in_reply_to_id": comment.get("in_reply_to_id"),
            "start_line": comment["original_start_line"] or comment["original_line"],
            "end_line": comment["original_line"],
            "commit_id": comment["commit_id"],
        }
        for pull in comments
        for comment in pull
    }
    return [
        {
            "pull": {
                "number": pull["number"],
                "title": pull["title"],
                "body": pull["body"],
                "user_id": pull["user"]["id"],
            },
            "threads": [[id_map[_id] for _id in thread] for thread in self.arrange_in_threads(_comments)],
        }
        for pull, _comments in zip(pulls, comments)
    ]

@staticmethod
def arrange_in_threads(comments: List[Dict[str, Any]]) -> List[List[int]]:
# Chain the threads together
unused_nodes = {comment["id"] for comment in comments}
prev_map = {comment["id"]: comment.get("in_reply_to_id") for comment in comments}
next_map = {

Check warning on line 257 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L255-L257

Added lines #L255 - L257 were not covered by tests
comment["in_reply_to_id"]: comment["id"]
for comment in comments
if isinstance(comment.get("in_reply_to_id"), int)
}

threads = []
while len(unused_nodes) > 0:
_id = next(iter(unused_nodes))
_thread = [_id]
unused_nodes.remove(_id)
while isinstance(next_map.get(_thread[-1]), int):
_thread.append(next_map[_thread[-1]])
unused_nodes.remove(next_map[_thread[-1]])
while isinstance(prev_map.get(_thread[0]), int):
_thread.insert(0, prev_map[_thread[0]])
unused_nodes.remove(prev_map[_thread[0]])
threads.append(_thread)

Check warning on line 274 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L263-L274

Added lines #L263 - L274 were not covered by tests

return sorted(threads, key=itemgetter(0))

Check warning on line 276 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L276

Added line #L276 was not covered by tests

def fetch_pull_comments_from_repo(
    self, repo_name: str, num_pulls: int = 30, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Collect the issue-style comments of the latest pull requests of a repository."""
    # Fetch pulls & filter them
    recent_pulls = self.list_pulls(repo_name, token, per_page=num_pulls)
    # Retrieve the comments of every pull in parallel
    fetcher = partial(self.list_comments_from_issue, repo_name=repo_name, token=token, per_page=100, **kwargs)
    all_comments = execute_in_parallel(fetcher, (pull["number"] for pull in recent_pulls), len(recent_pulls))
    results = []
    for pull, pull_comments in zip(recent_pulls, all_comments):
        results.append(
            {
                "pull": {
                    "number": pull["number"],
                    "title": pull["title"],
                    "body": pull["body"],
                    "user_id": pull["user"]["id"],
                },
                "comments": [
                    {
                        "id": comment["id"],
                        "body": comment["body"],
                        "user_id": comment["user"]["id"],
                        "reactions_total_count": comment["reactions"]["total_count"],
                    }
                    for comment in pull_comments
                ],
            }
        )
    return results


gh_client = GitHubClient(settings.GH_TOKEN)
Loading
Loading