diff --git a/dags/github.py b/dags/github.py index 1e949097..2d1bca37 100644 --- a/dags/github.py +++ b/dags/github.py @@ -24,6 +24,7 @@ from airflow import DAG from airflow.decorators import task from github.github_api_helpers import ( + extract_linked_issues_from_pr, fetch_commit_files, fetch_org_details, get_all_commits, @@ -181,6 +182,25 @@ def extract_pull_requests(data): new_data = {"prs": prs, **data} return new_data + @task + def extract_pull_request_linked_issues(data): + logging.info(f"All data from last stage: {data}") + repo = data["repo"] + prs = data["prs"] + owner = repo["owner"]["login"] + repo_name = repo["name"] + + new_prs = [] + for pr in prs: + pr_number = pr["number"] + linked_issues = extract_linked_issues_from_pr( + owner=owner, repo=repo_name, pull_number=pr_number + ) + new_prs.append({**pr, "linked_issues": linked_issues}) + + new_data = {**data, "prs": new_prs} + return new_data + @task def transform_pull_requests(data): logging.info(f"All data from last stage: {data}") @@ -515,11 +535,19 @@ def load_commits_files_changes(data): transform_label = transform_labels.expand(data=labels) load_label = load_labels.expand(data=transform_label) + issues = extract_issues.expand(data=repos) + transform_issue = transform_issues.expand(data=issues) + load_issue = load_issues.expand(data=transform_issue) + load_contributors >> load_issue + load_label >> load_issue + prs = extract_pull_requests.expand(data=repos) - transform_prs = transform_pull_requests.expand(data=prs) + prs_linked_issues = extract_pull_request_linked_issues.expand(data=prs) + transform_prs = transform_pull_requests.expand(data=prs_linked_issues) load_prs = load_pull_requests.expand(data=transform_prs) load_contributors >> load_prs load_label >> load_prs + load_issue >> load_prs pr_files_changes = extract_pull_request_files_changes.expand(data=prs) transform_pr_files_changes = transform_pull_request_files_changes.expand( @@ -529,12 +557,6 @@ def load_commits_files_changes(data): data=transform_pr_files_changes ) - issues = extract_issues.expand(data=repos) - transform_issue = transform_issues.expand(data=issues) - load_issue = load_issues.expand(data=transform_issue) - load_contributors >> load_issue - load_label >> load_issue - pr_reviews = extract_pr_review.expand(data=prs) transform_pr_review = transform_pr_review.expand(data=pr_reviews) load_pr_review = load_pr_review.expand(data=transform_pr_review) diff --git a/dags/github/github_api_helpers/__init__.py b/dags/github/github_api_helpers/__init__.py index f709f6e5..52f9326d 100644 --- a/dags/github/github_api_helpers/__init__.py +++ b/dags/github/github_api_helpers/__init__.py @@ -5,6 +5,7 @@ from .labels import get_all_repo_labels from .orgs import fetch_org_details, get_all_org_members from .pull_requests import ( + extract_linked_issues_from_pr, get_all_comments_of_pull_request, get_all_commits_of_pull_request, get_all_pull_request_files, diff --git a/dags/github/github_api_helpers/issues.py b/dags/github/github_api_helpers/issues.py index 44d6f4c6..3ae41746 100644 --- a/dags/github/github_api_helpers/issues.py +++ b/dags/github/github_api_helpers/issues.py @@ -3,6 +3,25 @@ from .smart_proxy import get +def fetch_issue(owner: str, repo: str, issue_number: int): + """ + Fetches a specific issue from a GitHub repository. + + :param owner: The owner of the repository. + :param repo: The name of the repository. + :param issue_number: The number of the issue. + :return: The issue data. + """ + endpoint = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}" + response = get(endpoint) + response_data = response.json() + + logging.info( + f"Fetched issue {issue_number} for {owner}/{repo}. Issue: {response_data}" + ) + return response_data + + # Issues def fetch_issues(owner: str, repo: str, page: int, per_page: int = 100): """ diff --git a/dags/github/github_api_helpers/pull_requests.py b/dags/github/github_api_helpers/pull_requests.py index 8a86a4fb..3463f81e 100644 --- a/dags/github/github_api_helpers/pull_requests.py +++ b/dags/github/github_api_helpers/pull_requests.py @@ -1,5 +1,8 @@ import logging +from bs4 import BeautifulSoup + +from .issues import fetch_issue from .smart_proxy import get @@ -58,6 +61,38 @@ def get_all_pull_requests(owner: str, repo: str): return all_pull_requests +def extract_issue_info_from_url(url): + splitted_url = url.split("/") + owner = splitted_url[-4] + repo = splitted_url[-3] + issue_number = splitted_url[-1] + + return {"owner": owner, "repo": repo, "issue_number": issue_number} + + +def extract_linked_issues_from_pr(owner: str, repo: str, pull_number: int): + html_pr_url = f"https://github.com/{owner}/{repo}/pull/{pull_number}" + response = get(html_pr_url) + linked_issue = [] + + soup = BeautifulSoup(response.text, "html.parser") + html_linked_issues = soup.find_all( + "span", + class_="Truncate truncate-with-responsive-width my-1", + attrs={"data-view-component": "true"}, + ) + for html_linked_issue in html_linked_issues: + issue_url = html_linked_issue.find("a").get("href") + issue_data = extract_issue_info_from_url(issue_url) + issue_info = fetch_issue( + issue_data["owner"], issue_data["repo"], issue_data["issue_number"] + ) + + linked_issue.append(issue_info) + + return linked_issue + + def fetch_pull_requests_commits( owner: str, repo: str, pull_number: int, page: int, per_page: int = 100 ): diff --git a/dags/github/neo4j_storage/neo4j_enums.py b/dags/github/neo4j_storage/neo4j_enums.py index 4c7cd974..813bc468 100644 --- a/dags/github/neo4j_storage/neo4j_enums.py +++ b/dags/github/neo4j_storage/neo4j_enums.py @@ -28,3 +28,4 @@ class Relationship(Enum): AUTHORED_BY = "AUTHORED_BY" IS_ON = "IS_ON" CHANGED = "CHANGED" + LINKED = "LINKED" diff --git a/dags/github/neo4j_storage/pull_requests.py b/dags/github/neo4j_storage/pull_requests.py index 86492b4a..588ee259 100644 --- a/dags/github/neo4j_storage/pull_requests.py +++ b/dags/github/neo4j_storage/pull_requests.py @@ -12,6 +12,7 @@ def save_pull_request_to_neo4j(pr: dict, repository_id: str): assignees = pr.pop("assignees", None) requested_reviewers = pr.pop("requested_reviewers", None) labels = pr.pop("labels", None) + linked_issues = pr.pop("linked_issues", None) cleaned_pr = remove_nested_collections(pr) if assignee: @@ -56,6 +57,16 @@ def save_pull_request_to_neo4j(pr: dict, repository_id: str): SET haslb.latestSavedAt = datetime() """ + linked_issues_query = f""" + WITH pr + UNWIND $linked_issues as linked_issue + MERGE (i:{Node.Issue.value} {{id: linked_issue.id}}) + SET i += linked_issue, i.latestSavedAt = datetime() + WITH pr, i + MERGE (pr)-[islinked:{Relationship.LINKED.value}]->(i) + SET islinked.latestSavedAt = datetime() + """ + with driver.session() as session: session.execute_write( lambda tx: tx.run( @@ -74,6 +85,7 @@ def save_pull_request_to_neo4j(pr: dict, repository_id: str): { assignees_query } { requested_reviewers_query } { labels_query } + { linked_issues_query } """, pr=cleaned_pr, @@ -83,6 +95,7 @@ def save_pull_request_to_neo4j(pr: dict, repository_id: str): assignees=assignees, labels=labels, requested_reviewers=requested_reviewers, + linked_issues=linked_issues, ) ) driver.close() diff --git a/requirements.txt b/requirements.txt index f5b8d815..3ce159ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ python-dotenv>=1.0.0, <2.0.0 urlextract==1.8.0 tc-hivemind-backend==1.1.3 traceloop-sdk==0.9.4 +beautifulsoup4==4.12.3