Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

feat: Improves guideline extraction pipeline #41

Merged
merged 18 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 52 additions & 13 deletions src/app/api/api_v1/endpoints/repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
from base64 import b64decode
from datetime import datetime
from functools import partial
from typing import List, cast

from fastapi import APIRouter, Depends, HTTPException, Path, Security, status
Expand All @@ -20,6 +21,7 @@
from app.services.openai import openai_client
from app.services.slack import slack_client
from app.services.telemetry import telemetry_client
from app.services.utils import execute_in_parallel

logger = logging.getLogger("uvicorn.error")
router = APIRouter()
Expand Down Expand Up @@ -177,26 +179,63 @@
telemetry_client.capture(user.id, event="repo-parse-guidelines", properties={"repo_id": repo_id})
# Sanity check
repo = cast(Repository, await repos.get(repo_id, strict=True))
# STATIC CONTENT
# Stage all the text sources
sources = []

Check warning on line 183 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L183

Added line #L183 was not covered by tests
# Parse CONTRIBUTING (README if CONTRIBUTING doesn't exist)
contributing = gh_client.get_file(repo.full_name, "CONTRIBUTING.md", payload.github_token)
# readme = gh_client.get_readme(payload.github_token)
# diff_hunk, body, path
# comments = gh_client.list_review_comments(payload.github_token)
readme = gh_client.get_readme(repo.full_name, payload.github_token) if contributing is None else None
if contributing is not None:
sources.append((contributing["path"], b64decode(contributing["content"]).decode()))
if readme is not None:
sources.append((readme["path"], b64decode(readme["content"]).decode()))

Check warning on line 190 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L186-L190

Added lines #L186 - L190 were not covered by tests
# Pull request comments (!= review comments/threads)
pull_comments = [

Check warning on line 192 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L192

Added line #L192 was not covered by tests
pull
for pull in gh_client.fetch_pull_comments_from_repo(repo.full_name, token=payload.github_token)
if len(pull["comments"]) > 0
]
if len(pull_comments) > 0:

Check warning on line 197 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L197

Added line #L197 was not covered by tests
# Keep: body, user/id, reactions/total_count
corpus = "# Pull request comments\n\n\n\n\n\n".join([

Check warning on line 199 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L199

Added line #L199 was not covered by tests
f"PULL REQUEST {pull['pull']['number']} from user {pull['pull']['user_id']}\n\n"
+ "\n\n".join(f"[User {comment['user_id']}] {comment['body']}" for comment in pull["comments"])
for pull in pull_comments
])
sources.append(("pull_request_comments", corpus))

Check warning on line 204 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L204

Added line #L204 was not covered by tests
# Review threads
review_comments = [

Check warning on line 206 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L206

Added line #L206 was not covered by tests
pull
for pull in gh_client.fetch_reviews_from_repo(repo.full_name, token=payload.github_token)
if len(pull["threads"]) > 0
]
# Ideas: filter on pulls with highest amount of comments recently, add the review output rejection/etc
if len(review_comments) > 0:

Check warning on line 212 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L212

Added line #L212 was not covered by tests
# Keep: code, body, user/id, reactions/total_count
corpus = "# Code review history\n\n\n\n\n\n".join([

Check warning on line 214 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L214

Added line #L214 was not covered by tests
f"PULL: {pull['pull']['number']} from user {pull['pull']['user_id']}\n\n"
+ "\n\n".join(
f"[Code diff]\n```{thread[0]['code']}\n```\n"
+ "\n".join(f"[User {comment['user_id']}] {comment['body']}" for comment in thread)
for thread in pull["threads"]
)
for pull in review_comments
])
sources.append(("review_comments", corpus))

Check warning on line 223 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L223

Added line #L223 was not covered by tests
# If not enough information, raise error
if contributing is None:
if len(sources) == 0:

Check warning on line 225 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L225

Added line #L225 was not covered by tests
raise HTTPException(status.HTTP_404_NOT_FOUND, detail="No useful information is accessible in the repository")
# Analyze with LLM
contributing_guidelines = openai_client.parse_guidelines_from_text(
b64decode(contributing["content"]).decode(),
user_id=str(user.id),
# Process all sources in parallel
responses = execute_in_parallel(

Check warning on line 228 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L228

Added line #L228 was not covered by tests
partial(openai_client.parse_guidelines_from_text, user_id=str(user.id)),
(corpus for _, corpus in sources),
num_threads=len(sources),
)
# contributing_guidelines = ollama_client.parse_guidelines_from_text(b64decode(contributing["content"]).decode())
return [
ParsedGuideline(**guideline.dict(), repo_id=repo_id, origin_path=contributing["path"])
for guideline in contributing_guidelines
guidelines = [

Check warning on line 233 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L233

Added line #L233 was not covered by tests
ParsedGuideline(**guideline.dict(), repo_id=repo_id, source=source)
for (source, _), response in zip(sources, responses)
for guideline in response
]
return guidelines

Check warning on line 238 in src/app/api/api_v1/endpoints/repos.py

View check run for this annotation

Codecov / codecov/patch

src/app/api/api_v1/endpoints/repos.py#L238

Added line #L238 was not covered by tests


@router.post("/{repo_id}/waitlist", status_code=status.HTTP_200_OK, summary="Add a GitHub repository to the waitlist")
Expand Down
2 changes: 1 addition & 1 deletion src/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def sqlachmey_uri(cls, v: str) -> str:
JWT_ENCODING_ALGORITHM: str = "HS256"
# Compute
OPENAI_API_KEY: str = os.environ["OPENAI_API_KEY"]
OPENAI_MODEL: OpenAIModel = OpenAIModel.GPT4_TURBO
OPENAI_MODEL: OpenAIModel = OpenAIModel.GPT3_5_TURBO

# Error monitoring
SENTRY_DSN: Union[str, None] = os.environ.get("SENTRY_DSN")
Expand Down
4 changes: 2 additions & 2 deletions src/app/schemas/guidelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ class GuidelineExample(BaseModel):


class GuidelineContent(BaseModel):
title: str = Field(..., min_length=6, max_length=100)
title: str = Field(..., min_length=3, max_length=100)
details: str = Field(..., min_length=6, max_length=1000)


class ParsedGuideline(GuidelineContent):
repo_id: int = Field(..., gt=0)
origin_path: str
source: str


class GuidelineLocation(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion src/app/schemas/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class ChatCompletion(BaseModel):
frequency_penalty: float = 1.0
response_format: _ResponseFormat = _ResponseFormat(type="json_object")
user: Union[str, None] = None
# seed: int = 42
seed: int = 42


class GHTokenRequest(BaseModel):
Expand Down
180 changes: 169 additions & 11 deletions src/app/services/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0> for full license details.

import logging
from typing import Any, Dict, List, Union
from functools import partial
from operator import itemgetter
from typing import Any, Dict, List, Union, cast

import requests
from fastapi import HTTPException, status
Expand All @@ -13,12 +15,19 @@
from app.core.config import settings
from app.models import User, UserScope
from app.schemas.services import GHToken, GHTokenRequest
from app.services.utils import execute_in_parallel

logger = logging.getLogger("uvicorn.error")

__all__ = ["gh_client"]


def resolve_diff_section(diff_hunk: str, first_line: int, last_line: int) -> str:
    """Extract the trailing section of a diff hunk covering the requested line span.

    Assumes the final line of ``diff_hunk`` corresponds to ``last_line``, so the
    section is simply the last ``last_line - first_line + 1`` lines of the hunk.
    """
    span = last_line - first_line + 1
    hunk_lines = diff_hunk.split("\n")
    return "\n".join(hunk_lines[-span:])

Check warning on line 28 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L27-L28

Added lines #L27 - L28 were not covered by tests


class GitHubClient:
ENDPOINT: str = "https://api.github.com"
OAUTH_ENDPOINT: str = "https://github.com/login/oauth/access_token"
Expand Down Expand Up @@ -131,23 +140,172 @@
f"repos/{repo_name}/pulls",
token,
state="closed",
sort="popularity",
sort="created",
direction="desc",
base=self._get(f"repos/{repo_name}", token).json()["default_branch"],
per_page=per_page,
).json()

def list_review_comments(self, repo_name: str, token: Union[str, None] = None) -> List[Dict[str, Any]]:
# https://docs.github.com/en/rest/pulls/comments#list-review-comments-in-a-repository
comments = self._get(
f"repos/{repo_name}/pulls/comments",
def list_comments_from_issue(
    self, issue_number: int, repo_name: str, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Retrieve the human-authored comments of a given issue.

    See https://docs.github.com/en/rest/issues/comments#list-issue-comments
    """
    response = self._get(f"repos/{repo_name}/issues/{issue_number}/comments", token, **kwargs)
    # Keep only comments posted by real user accounts (drops bots & apps)
    return [comment for comment in response.json() if comment["user"]["type"] == "User"]

def list_reviews_from_pull(
    self, repo_name: str, pull_number: int, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Fetch the reviews of a pull request, most recent first.

    See https://docs.github.com/en/rest/pulls/reviews#list-reviews-for-a-pull-request
    """
    endpoint = f"repos/{repo_name}/pulls/{pull_number}/reviews"
    response = self._get(
        endpoint,
        token,
        sort="created_at",
        direction="desc",
        per_page=100,
        **kwargs,
    )
    return response.json()

def list_threads_from_review(
    self, repo_name: str, pull_number: int, review_id: int, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Fetch the comments attached to a specific pull-request review.

    See https://docs.github.com/en/rest/pulls/reviews#list-comments-for-a-pull-request-review
    """
    endpoint = f"repos/{repo_name}/pulls/{pull_number}/reviews/{review_id}/comments"
    return self._get(endpoint, token, **kwargs).json()
# Get comments (filter account type == user, & user != author) --> take diff_hunk, body, path
return [comment for comment in comments if comment["user"]["type"] == "User"]

def list_review_comments_from_pull(
    self, pull_number: int, repo_name: str, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Fetch the review comments of a pull request, authored by human users.

    See https://docs.github.com/en/rest/pulls/comments#list-review-comments-on-a-pull-request

    Args:
        pull_number: number of the pull request.
        repo_name: full name of the repository (e.g. "owner/repo").
        token: optional GitHub token used for authentication.
        kwargs: extra query parameters forwarded to the API call.

    Returns:
        A flat list of review comments posted by accounts of type "User"
        (bots and apps are filtered out).
    """
    # NOTE: the return annotation was previously List[List[Dict[str, Any]]],
    # but a flat list of comments is what is actually returned.
    comments = self._get(
        f"repos/{repo_name}/pulls/{pull_number}/comments",
        token,
        sort="created_at",
        **kwargs,
    ).json()
    return [comment for comment in comments if comment["user"]["type"] == "User"]

def fetch_reviews_from_repo(
    self, repo_name: str, num_pulls: int = 30, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Collect the review threads of the latest pull requests of a repository.

    Args:
        repo_name: full name of the repository (e.g. "owner/repo").
        num_pulls: maximum number of pull requests to inspect.
        token: optional GitHub token used for authentication.
        kwargs: extra query parameters forwarded to the comment listing calls.

    Returns:
        One entry per pull request, with its metadata under "pull" and its review
        comments grouped into conversation threads under "threads".
    """
    # Fetch the most recent closed pulls
    pulls = self.list_pulls(repo_name, token, per_page=num_pulls)
    # Fetch the review comments of each pull in parallel
    comments = cast(
        List[List[Dict[str, Any]]],
        execute_in_parallel(
            partial(self.list_review_comments_from_pull, repo_name=repo_name, token=token, per_page=100, **kwargs),
            (pull["number"] for pull in pulls),
            len(pulls),
        ),
    )
    # Index every comment by ID so thread ID chains can be resolved back to full records
    id_map = {
        comment["id"]: {
            "id": comment["id"],
            # Trim the diff hunk down to the lines the comment actually targets
            "code": resolve_diff_section(
                comment["diff_hunk"],
                # original_start_line is null for single-line comments
                comment["original_start_line"] or comment["original_line"],
                comment["original_line"],
            ),
            "body": comment["body"],
            "path": comment["path"],
            "user_id": comment["user"]["id"],
            "reactions_total_count": comment["reactions"]["total_count"],
            "in_reply_to_id": comment.get("in_reply_to_id"),
            "start_line": comment["original_start_line"] or comment["original_line"],
            "end_line": comment["original_line"],
            "commit_id": comment["commit_id"],
        }
        for pull in comments
        for comment in pull
    }
    return [
        {
            "pull": {
                "number": pull["number"],
                "title": pull["title"],
                "body": pull["body"],
                "user_id": pull["user"]["id"],
            },
            "threads": [[id_map[_id] for _id in thread] for thread in self.arrange_in_threads(_comments)],
        }
        for pull, _comments in zip(pulls, comments)
    ]

@staticmethod
def arrange_in_threads(comments: List[Dict[str, Any]]) -> List[List[int]]:
# Chain the threads together
unused_nodes = {comment["id"] for comment in comments}
prev_map = {comment["id"]: comment.get("in_reply_to_id") for comment in comments}
next_map = {

Check warning on line 257 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L255-L257

Added lines #L255 - L257 were not covered by tests
comment["in_reply_to_id"]: comment["id"]
for comment in comments
if isinstance(comment.get("in_reply_to_id"), int)
}

threads = []
while len(unused_nodes) > 0:
_id = next(iter(unused_nodes))
_thread = [_id]
unused_nodes.remove(_id)
while isinstance(next_map.get(_thread[-1]), int):
_thread.append(next_map[_thread[-1]])
unused_nodes.remove(next_map[_thread[-1]])
while isinstance(prev_map.get(_thread[0]), int):
_thread.insert(0, prev_map[_thread[0]])
unused_nodes.remove(prev_map[_thread[0]])
threads.append(_thread)

Check warning on line 274 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L263-L274

Added lines #L263 - L274 were not covered by tests

return sorted(threads, key=itemgetter(0))

Check warning on line 276 in src/app/services/github.py

View check run for this annotation

Codecov / codecov/patch

src/app/services/github.py#L276

Added line #L276 was not covered by tests

def fetch_pull_comments_from_repo(
    self, repo_name: str, num_pulls: int = 30, token: Union[str, None] = None, **kwargs
) -> List[Dict[str, Any]]:
    """Collect the issue-style comments of the latest pull requests of a repository."""
    # Fetch pulls & filter them
    recent_pulls = self.list_pulls(repo_name, token, per_page=num_pulls)
    # Retrieve the comments of every pull in parallel
    fetcher = partial(self.list_comments_from_issue, repo_name=repo_name, token=token, per_page=100, **kwargs)
    all_comments = execute_in_parallel(fetcher, (pull["number"] for pull in recent_pulls), len(recent_pulls))
    results = []
    for pull, pull_comments in zip(recent_pulls, all_comments):
        results.append(
            {
                "pull": {
                    "number": pull["number"],
                    "title": pull["title"],
                    "body": pull["body"],
                    "user_id": pull["user"]["id"],
                },
                "comments": [
                    {
                        "id": comment["id"],
                        "body": comment["body"],
                        "user_id": comment["user"]["id"],
                        "reactions_total_count": comment["reactions"]["total_count"],
                    }
                    for comment in pull_comments
                ],
            }
        )
    return results


gh_client = GitHubClient(settings.GH_TOKEN)
Loading
Loading