Skip to content

Commit

Permalink
Merge pull request #6 from TogetherCrew/feature/files
Browse files Browse the repository at this point in the history
feat: two functions were added to extract commits and prs files
  • Loading branch information
cyri113 authored Dec 12, 2023
2 parents 0497539 + 6cbcbba commit 0e82b13
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 5 deletions.
76 changes: 76 additions & 0 deletions dags/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,16 @@
get_all_reviews_of_pull_request,
get_all_repo_review_comments,
get_all_repo_contributors,
get_all_pull_request_files,
get_all_org_members,
get_all_repo_labels,
fetch_commit_files,
)
from neo4j_storage import (
save_orgs_to_neo4j, save_repo_to_neo4j,
save_commit_files_changes_to_neo4j,
save_repo_contributors_to_neo4j,
save_pr_files_changes_to_neo4j,
save_review_comment_to_neo4j,
get_orgs_profile_from_neo4j,
save_pull_request_to_neo4j,
Expand Down Expand Up @@ -173,6 +177,37 @@ def load_pull_requests(data):
return data
#endregion

#region pull request files changes ETL
@task
def extract_pull_request_files_changes(data):
repo = data['repo']
owner = repo['owner']['login']
repo_name = repo['name']
prs = data['prs']

pr_files_changes = {}
for pr in prs:
files_changes = get_all_pull_request_files(owner= owner, repo= repo_name, pull_number= pr.get('number', None))
pr_files_changes[pr['id']] = files_changes

return { "pr_files_changes": pr_files_changes, **data }

@task
def transform_pull_request_files_changes(data):
return data

@task
def load_pull_request_files_changes(data):
pr_files_changes = data['pr_files_changes']
repository_id = data['repo']['id']

for pr_id, files_changes in pr_files_changes.items():
save_pr_files_changes_to_neo4j(pr_id= pr_id, repository_id= repository_id, file_changes= files_changes)

return data

#endregion

#region pr review ETL
@task
def extract_pr_review(data):
Expand Down Expand Up @@ -370,6 +405,38 @@ def load_commits(data):

#endregion

#region commits files changes ETL
@task
def extract_commits_files_changes(data):
repo = data['repo']
owner = repo['owner']['login']
repo_name = repo['name']
commits = data['commits']

commits_files_changes = {}
for commit in commits:
sha = commit['sha']
files_changes = fetch_commit_files(owner= owner, repo= repo_name, sha= sha)
commits_files_changes[sha] = files_changes

return { "commits_files_changes": commits_files_changes, **data }

@task
def transform_commits_files_changes(data):
return data

@task
def load_commits_files_changes(data):
commits_files_changes = data['commits_files_changes']
repository_id = data['repo']['id']

for sha, files_changes in commits_files_changes.items():
save_commit_files_changes_to_neo4j(commit_sha= sha, repository_id= repository_id, file_changes= files_changes)

return data

#endregion

orgs = get_all_organization()
orgs_info = extract_github_organization.expand(organization= orgs)
transform_orgs = transform_github_organization.expand(organization= orgs_info)
Expand Down Expand Up @@ -400,6 +467,10 @@ def load_commits(data):
load_contributors >> load_prs
load_label >> load_prs

pr_files_changes = extract_pull_request_files_changes.expand(data= prs)
transform_pr_files_changes = transform_pull_request_files_changes.expand(data= pr_files_changes)
load_pr_files_changes = load_pull_request_files_changes.expand(data= transform_pr_files_changes)

issues = extract_issues.expand(data= repos)
transform_issue = transform_issues.expand(data= issues)
load_issue = load_issues.expand(data= transform_issue)
Expand All @@ -425,3 +496,8 @@ def load_commits(data):
transform_comment = transform_commits.expand(data= commits)
load_comment = load_commits.expand(data= transform_comment)

commits_files_changes = extract_commits_files_changes.expand(data= commits)
transform_commits_files_changes = transform_commits_files_changes.expand(data= commits_files_changes)
load_commits_files_changes = load_commits_files_changes.expand(data= transform_commits_files_changes)
load_comment >> load_commits_files_changes
load_pr_files_changes >> load_commits_files_changes
4 changes: 2 additions & 2 deletions dags/github_api_helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from .repos import get_all_org_repos, get_all_repo_contributors
from .commits import get_all_commits, fetch_commit_details
from .commits import get_all_commits, fetch_commit_details, fetch_commit_files
from .issues import get_all_issues, get_all_comments_of_issue
from .pull_requests import (get_all_pull_requests, get_all_commits_of_pull_request,
get_all_comments_of_pull_request, get_all_review_comments_of_pull_request,
get_all_reactions_of_review_comment, get_all_reactions_of_comment,
get_all_reviews_of_pull_request)
get_all_reviews_of_pull_request, get_all_pull_request_files)
from .orgs import fetch_org_details, get_all_org_members
from .labels import get_all_repo_labels
from .comments import get_all_repo_review_comments, get_all_repo_issues_and_prs_comments
15 changes: 15 additions & 0 deletions dags/github_api_helpers/commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,18 @@ def fetch_commit_details(owner: str, repo: str, commit_sha: str):
endpoint = f'https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}'
response = get(endpoint)
return response.json()

def fetch_commit_files(owner: str, repo: str, sha: str):
"""
Retrieves the files changed in a specific commit of a GitHub repository.
:param owner: The owner of the repository.
:param repo: The name of the repository.
:param sha: The SHA identifier of the commit.
:return: A list of files changed in the specified commit.
"""
commit_details = fetch_commit_details(owner, repo, sha)
if 'files' in commit_details:
return commit_details['files']
else:
return []
44 changes: 44 additions & 0 deletions dags/github_api_helpers/pull_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ def get_all_reactions_of_comment(owner: str, repo: str, comment_id: int):
current_page += 1
return all_reactions


def fetch_pull_request_reviews(owner: str, repo: str, pull_number: int, page: int, per_page: int = 100):
"""
Fetches the reviews for a specific pull request page by page.
Expand Down Expand Up @@ -271,3 +272,46 @@ def get_all_reviews_of_pull_request(owner: str, repo: str, pull_number: int):
all_reviews.extend(reviews)
current_page += 1
return all_reviews


def fetch_pull_request_files_page(owner: str, repo: str, pull_number: int, page: int, per_page: int = 100):
"""
Fetches the files of a specific pull request in a GitHub repository.
:param owner: The owner of the repository.
:param repo: The name of the repository.
:param pull_number: The number of the pull request.
:param page: The page number of the results.
:param per_page: The number of results per page (default is 30).
:return: A list of files for the specified pull request.
"""
endpoint = f'https://api.github.com/repos/{owner}/{repo}/pulls/{pull_number}/files'

params = {
"per_page": per_page,
"page": page
}
response = get(endpoint, params=params)
response_data = response.json()

return response_data

def get_all_pull_request_files(owner: str, repo: str, pull_number: int):
"""
Retrieves all files for a specified pull request in a GitHub repository.
:param owner: The owner of the repository.
:param repo: The name of the repository.
:param pull_number: The number of the pull request.
:return: A list of all files for the specified pull request.
"""
files = []
page = 1
while True:
page_files = fetch_pull_request_files_page(owner, repo, pull_number, page)
if not page_files:
break
files.extend(page_files)
page += 1

return files
4 changes: 2 additions & 2 deletions dags/neo4j_storage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .orgs import save_orgs_to_neo4j, save_org_member_to_neo4j, get_orgs_profile_from_neo4j
from .repos import save_repo_to_neo4j, save_repo_contributors_to_neo4j
from .pull_requests import save_pull_request_to_neo4j, save_review_to_neo4j
from .pull_requests import save_pull_request_to_neo4j, save_review_to_neo4j, save_pr_files_changes_to_neo4j
from .issues import save_issue_to_neo4j
from .labels import save_label_to_neo4j
from .commits import save_commit_to_neo4j
from .commits import save_commit_to_neo4j, save_commit_files_changes_to_neo4j
from .comments import save_review_comment_to_neo4j, save_comment_to_neo4j
20 changes: 20 additions & 0 deletions dags/neo4j_storage/commits.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,23 @@ def save_commit_to_neo4j(commit: dict, repository_id: str):

driver.close()

def save_commit_files_changes_to_neo4j(commit_sha: str, repository_id: str, file_changes: list):

neo4jConnection = Neo4jConnection()
driver = neo4jConnection.connect_neo4j()

with driver.session() as session:
session.execute_write(lambda tx:
tx.run(f"""
MATCH (repo:{Node.Repository.value} {{id: $repository_id}}), (c:{Node.Commit.value} {{sha: $commit_sha}})
WITH repo, c
UNWIND $file_changes AS file_change
MERGE (f:{Node.File.value} {{sha: file_change.sha, filename: file_change.filename}})
SET f += file_change, f.latestSavedAt = datetime()
MERGE (c)-[fc:{Relationship.CHANGED.value}]->(f)
SET fc.latestSavedAt = datetime()
MERGE (f)-[io:{Relationship.IS_ON.value}]->(repo)
SET io.latestSavedAt = datetime()
""", commit_sha= commit_sha, repository_id= repository_id, file_changes= file_changes))

driver.close()
2 changes: 2 additions & 0 deletions dags/neo4j_storage/neo4j_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Node(Enum):
Commit = "Commit"
Comment = "Comment"
ReviewComment = "ReviewComment"
File = "File"

class Relationship(Enum):
IS_MEMBER = "IS_MEMBER"
Expand All @@ -22,3 +23,4 @@ class Relationship(Enum):
HAS_LABEL = "HAS_LABEL"
COMMITTED = "COMMITTED"
IS_ON = "IS_ON"
CHANGED = "CHANGED"
27 changes: 26 additions & 1 deletion dags/neo4j_storage/pull_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,29 @@ def save_review_to_neo4j(pr_id: dict, review: dict):
""", pr_id= int(pr_id), author= author, review= review)
)

driver.close()
driver.close()

def save_pr_files_changes_to_neo4j(pr_id: int, repository_id: str, file_changes: list):

neo4jConnection = Neo4jConnection()
driver = neo4jConnection.connect_neo4j()

print(f"MATCH (repo:{Node.Repository.value} {{id: $repository_id}}), (pr:{Node.PullRequest.value} {{id: $pr_id}})")
print("repository_id", repository_id)
print("pr_id", pr_id)

with driver.session() as session:
session.execute_write(lambda tx:
tx.run(f"""
MATCH (repo:{Node.Repository.value} {{id: $repository_id}}), (pr:{Node.PullRequest.value} {{id: $pr_id}})
WITH repo, pr
UNWIND $file_changes AS file_change
MERGE (f:{Node.File.value} {{sha: file_change.sha, filename: file_change.filename}})
SET f += file_change, f.latestSavedAt = datetime()
MERGE (pr)-[fc:{Relationship.CHANGED.value}]->(f)
SET fc.latestSavedAt = datetime()
MERGE (f)-[io:{Relationship.IS_ON.value}]->(repo)
SET io.latestSavedAt = datetime()
""", pr_id= int(pr_id), repository_id= int(repository_id), file_changes= file_changes))

driver.close()

0 comments on commit 0e82b13

Please sign in to comment.