From 5c2f7dfae423c50947b82354f3e38ec89147aa92 Mon Sep 17 00:00:00 2001
From: "Mark A. Miller"
Date: Mon, 1 Jul 2024 08:59:54 -0400
Subject: [PATCH] discos request timeout

---
 llm_github/core.py  | 393 ++++++++++++++++++++++++++++++++++++++++++++
 llm_github/foo.py   |  17 --
 local/.env.template |   2 +-
 poetry.lock         | 104 +++++++++++-
 pyproject.toml      |   2 +
 tests/test_foo.py   |   2 +-
 6 files changed, 500 insertions(+), 20 deletions(-)
 create mode 100644 llm_github/core.py
 delete mode 100644 llm_github/foo.py

diff --git a/llm_github/core.py b/llm_github/core.py
new file mode 100644
index 0000000..35a14ea
--- /dev/null
+++ b/llm_github/core.py
@@ -0,0 +1,393 @@
+import json
+import os
+import time
+from typing import Dict, List, Optional
+
+import requests
+from dotenv import load_dotenv
+from requests_cache import CachedSession, SQLiteCache
+
+REQUESTS_TIMEOUT = 10  # Timeout in seconds for requests
+
+# Default fields to be dropped from responses
+DEFAULT_DROPPED_FIELDS = [
+    "_links",
+    "base",
+    "comments_url",
+    "commits_url",
+    "diff_url",
+    "events_url",
+    "head",
+    "html_url",
+    "labels_url",
+    "locked",
+    "merge_commit_sha",
+    "node_id",
+    "patch_url",
+    "repository_url",
+    "review_comment_url",
+    "review_comments_url",
+    "statuses_url",
+    "timeline_url",
+]
+
+
+class EnvironmentVariableError(Exception):
+    """Exception raised for errors in the environment variables."""
+
+    def __init__(self, variable, message="is not set in the environment."):
+        self.variable = variable
+        self.message = message
+        super().__init__(f"{variable} {message}")
+
+
+# Load environment variables from .env file
+load_dotenv(dotenv_path="../local/.env", verbose=True)
+
+# Global access token for GitHub API
+global_token = os.getenv("GITHUB_TOKEN")
+if not global_token:
+    raise EnvironmentVariableError("GITHUB_TOKEN")
+print("Token loaded successfully.")
+
+# Set up cache with SQLite backend
+session = CachedSession(
+    cache_name="llm-github-cache",
+    backend=SQLiteCache("llm-github.sqlite", timeout=86400),  # Cache expires after 24 hours
+)
+
+
+def get_rate_limit(token: str) -> Dict[str, int]:
+    """Fetch current rate limit status from GitHub API."""
+    headers = {"Authorization": f"token {token}"}
+    response = session.get("https://api.github.com/rate_limit", headers=headers, timeout=REQUESTS_TIMEOUT)
+    response.raise_for_status()  # Raises HTTPError for bad requests
+    return response.json()["rate"]
+
+
+def wait_for_rate_limit_reset(reset_time: int) -> None:
+    """Wait until the rate limit reset time."""
+    wait_time = reset_time - int(time.time()) + 10  # Adding 10 seconds to ensure the reset has occurred
+    print(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
+    time.sleep(wait_time)
+
+
+def remove_keys_from_dict(data: Dict, keys_to_remove: List[str]) -> Dict:
+    """Remove specified keys from a dictionary."""
+    return {key: value for key, value in data.items() if key not in keys_to_remove}
+
+
+def write_json_to_file(json_object: List[Dict], filename: str) -> None:
+    """Save data to a JSON file."""
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(json_object, f, ensure_ascii=False, indent=4)
+    print(f"Data saved to {filename}")
+
+
+def handle_response_errors(response: requests.Response) -> None:
+    """Handle HTTP errors from a response."""
+    if response.status_code == 404:
+        print("Resource not found. Check the requested resource or permissions.")
+    elif response.status_code == 403:
+        print("Access forbidden. Ensure token has the required scopes or check for rate limits.")
+    elif response.status_code == 401:
+        print("Unauthorized. Check if the token is valid or expired.")
+    else:
+        print(f"Failed to fetch data. Status code: {response.status_code}")
+    print("Error message:", response.text)
+
+
+def github_token_check(token: str) -> Optional[Dict]:
+    """Validate the GitHub token by fetching user profile."""
+    headers = {"Authorization": f"token {token}"}
+    response = session.get("https://api.github.com/user", headers=headers, timeout=REQUESTS_TIMEOUT)
+    if response.status_code == 200:
+        print("Token is valid. User data retrieved successfully.")
+        return response.json()
+    print(f"Failed to authenticate. Status code: {response.status_code}")
+    return None
+
+
+def list_user_orgs(token: str) -> Optional[List[Dict]]:
+    """List all organizations the user is a member of."""
+    rate_limit = get_rate_limit(token)
+    if rate_limit["remaining"] == 0:
+        wait_for_rate_limit_reset(rate_limit["reset"])
+    headers = {"Authorization": f"token {token}"}
+    response = session.get("https://api.github.com/user/orgs", headers=headers, timeout=REQUESTS_TIMEOUT)
+    if response.status_code == 200:
+        print("Organizations retrieved successfully.")
+        return response.json()
+    handle_response_errors(response)
+    return None
+
+
+def get_repos(org: str, token: str) -> Optional[List[Dict]]:
+    """Fetch all repositories for a given organization."""
+    rate_limit = get_rate_limit(token)
+    if rate_limit["remaining"] == 0:
+        wait_for_rate_limit_reset(rate_limit["reset"])
+    repos = []
+    url = f"https://api.github.com/orgs/{org}/repos"
+    headers = {"Authorization": f"token {token}"}
+    while url:
+        response = session.get(url, headers=headers, timeout=REQUESTS_TIMEOUT)
+        if response.status_code == 200:
+            repos.extend(response.json())
+            url = response.links.get("next", {}).get("url")
+        else:
+            handle_response_errors(response)
+            return None
+    return repos
+
+
+def fetch_issues(org: str, token: str) -> Optional[List[Dict]]:
+    """Fetch all issues from all repositories in an organization, handling pagination and rate limits."""
+    issues = []
+    repos = get_repos(org, token)
+    if not repos:
+        print("No repositories found or failed to fetch repositories.")
+        return None
+
+    for repo in repos:
+        # Ensure the URL is constructed to fetch all issues (not just open ones)
+        url = repo["issues_url"].replace("{/number}", "?state=all")
+        while url:
+            rate_limit = get_rate_limit(token)  # Check rate limit before each request
+            if rate_limit["remaining"] == 0:
+                wait_for_rate_limit_reset(rate_limit["reset"])
+
+            response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT)
+            if response.status_code == 200:
+                issues.extend(response.json())
+                links = response.links
+                url = links["next"]["url"] if "next" in links else None
+            else:
+                print(f"Failed to fetch issues for {repo['name']}. Status code: {response.status_code}")
+                print("Error message:", response.text)
+                return None
+    return issues
+
+
+def sanitize_user_data(data: Dict) -> Dict:
+    """Recursively sanitize user data to keep only the user 'login'."""
+    if isinstance(data, dict):
+        if "login" in data and set(data.keys()) - {"login"}:
+            return {"login": data["login"]}
+        else:
+            return {key: sanitize_user_data(value) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_user_data(item) for item in data]
+    return data
+
+
+def remove_empty_values(data: Dict) -> Dict:
+    """Recursively remove keys with empty values from a dictionary or list."""
+    if isinstance(data, dict):
+        return {k: remove_empty_values(v) for k, v in data.items() if v or isinstance(v, bool)}
+    elif isinstance(data, list):
+        return [remove_empty_values(item) for item in data if item or isinstance(item, bool)]
+    return data
+
+
+def process_issues(issues: List[Dict], keys_to_remove: List[str]) -> List[Dict]:
+    """Process a list of issues to sanitize user information and remove empty values."""
+    processed_issues = []
+    for issue in issues:
+        sanitized_issue = sanitize_user_data(issue)
+        cleaned_issue = remove_empty_values(sanitized_issue)
+        final_issue = remove_keys_from_dict(cleaned_issue, keys_to_remove)
+        processed_issues.append(final_issue)
+    return processed_issues
+
+
+def fetch_pull_requests(org: str, token: str) -> Optional[List[Dict]]:
+    """Fetch all pull requests from all repositories in an organization, handling pagination and rate limits."""
+    pull_requests = []
+    repos = get_repos(org, token)
+    if not repos:
+        print("No repositories found or failed to fetch repositories.")
+        return None
+
+    for repo in repos:
+        url = f"{repo['url']}/pulls?state=all"
+        while url:
+            rate_limit = get_rate_limit(token)  # Check rate limit before each request
+            if rate_limit["remaining"] == 0:
+                wait_for_rate_limit_reset(rate_limit["reset"])
+
+            response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT)
+            if response.status_code == 200:
+                pull_requests.extend(response.json())
+                links = response.links
+                url = links["next"]["url"] if "next" in links else None
+            else:
+                print(f"Failed to fetch pull requests for {repo['name']}. Status code: {response.status_code}")
+                print("Error message:", response.text)
+                return None
+    return pull_requests
+
+
+def process_pull_requests(pull_requests: List[Dict], keys_to_remove: List[str]) -> List[Dict]:
+    """Process a list of pull requests to sanitize user information and remove empty values."""
+    processed_pull_requests = []
+    for pr in pull_requests:
+        sanitized_pr = sanitize_user_data(pr)
+        cleaned_pr = remove_empty_values(sanitized_pr)
+        final_pr = remove_keys_from_dict(cleaned_pr, keys_to_remove)
+        processed_pull_requests.append(final_pr)
+    return processed_pull_requests
+
+
+def fetch_all_comments(org: str, token: str) -> Optional[List[Dict]]:
+    """Fetch all comments from all repositories in an organization, distinguishing between issue and PR comments, while handling pagination and rate limits."""
+    all_comments = []
+    repos = get_repos(org, token)
+    if not repos:
+        print("No repositories found or failed to fetch repositories.")
+        return None
+
+    for repo in repos:
+        url = f"{repo['url']}/issues/comments?per_page=100"  # Adjusting per_page to fetch more comments per request if needed
+        while url:
+            rate_limit = get_rate_limit(token)  # Check rate limit before each request
+            if rate_limit["remaining"] == 0:
+                wait_for_rate_limit_reset(rate_limit["reset"])
+
+            response = session.get(url, headers={"Authorization": f"token {token}"}, timeout=REQUESTS_TIMEOUT)
+            if response.status_code == 200:
+                comments = response.json()
+                for comment in comments:
+                    if "pull_request" in comment:
+                        comment["type"] = "pull_request"
+                    else:
+                        comment["type"] = "issue"
+                all_comments.extend(comments)
+                links = response.links
+                url = links["next"]["url"] if "next" in links else None
+            else:
+                print(f"Failed to fetch comments for {repo['name']}. Status code: {response.status_code}")
+                print("Error message:", response.text)
+                return None
+    return all_comments
+
+
+def process_comments(comments: List[Dict], keys_to_remove: List[str]) -> List[Dict]:
+    """Process a list of comments to sanitize user information and remove empty values."""
+    processed_comments = []
+    for comment in comments:
+        sanitized_comment = sanitize_user_data(comment)
+        cleaned_comment = remove_empty_values(sanitized_comment)
+        final_comment = remove_keys_from_dict(cleaned_comment, keys_to_remove)
+        processed_comments.append(final_comment)
+    return processed_comments
+
+
+def fetch_all_discussions(org: str, token: str) -> Optional[List[Dict]]:
+    """Fetch discussions from all repositories in the specified organization."""
+    all_discussions = []
+    repos = get_repos(org, token)
+    if repos:
+        for repo in repos:
+            repo_name = repo["name"] if isinstance(repo, dict) else repo
+            print(f"Fetching discussions for repository: {repo_name}")
+            discussions = fetch_discussions_graphql(org, repo_name, token)
+            if discussions:
+                all_discussions.extend(discussions)
+            else:
+                print(f"No discussions found or an error occurred for repository: {repo_name}")
+    return all_discussions
+
+
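+# NOTE: Repository discussions are read via GitHub's GraphQL endpoint below. The call is made
+# with requests.post directly rather than the cached REST session, so its responses are not
+# cached; like the REST helpers above, it sets an explicit request timeout.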
+def fetch_discussions_graphql(org: str, repo: str, token: str) -> Optional[List[Dict]]:
+    """Fetch discussions using GitHub's GraphQL API."""
+    url = "https://api.github.com/graphql"
+    headers = {"Authorization": f"Bearer {token}"}
+    query = """
+    query FetchDiscussions($org: String!, $repo: String!) {
+      repository(owner: $org, name: $repo) {
+        discussions(first: 100) {
+          nodes {
+            number
+            title
+            url
+            bodyText
+            createdAt
+            updatedAt
+            author {
+              login
+            }
+            labels(first: 10) {
+              nodes {
+                name
+                description
+              }
+            }
+          }
+        }
+      }
+    }
+    """
+    variables = {"org": org, "repo": repo}
+    # Use the module-level request timeout for the GraphQL call as well
+    response = requests.post(url, json={"query": query, "variables": variables}, headers=headers, timeout=REQUESTS_TIMEOUT)
+    if response.status_code == 200:
+        data = response.json()
+        if "errors" in data:
+            print(f"GraphQL Errors: {json.dumps(data['errors'], indent=2)}")
+        return data.get("data", {}).get("repository", {}).get("discussions", {}).get("nodes", [])
+    print(f"Failed to fetch discussions. Status code: {response.status_code}")
+    print("Response:", response.text)
+    return None
+
+
+def process_discussions(discussions: List[Dict], keys_to_remove: List[str]) -> List[Dict]:
+    """Process a list of discussions to sanitize user information, remove empty values, and remove specified keys."""
+    processed_discussions = []
+    for discussion in discussions:
+        sanitized_discussion = sanitize_user_data(discussion)
+        cleaned_discussion = remove_empty_values(sanitized_discussion)
+        final_discussion = remove_keys_from_dict(cleaned_discussion, keys_to_remove)
+        processed_discussions.append(final_discussion)
+    return processed_discussions
+
+
+# Example usage
+user_data = github_token_check(global_token)
+orgs = list_user_orgs(global_token)
+
+# turbomam: Resource not found. This could be due to incorrect organization name or insufficient access permissions.
+# Error message: {"message":"Not Found","documentation_url":"https://docs.github.com/rest/repos/repos#list-organization-repositories","status":"404"}
+
+# microbiomedata: Access forbidden. Check if your token has the required scopes or if there's a rate limit issue.
+# Error message: {"message":"`microbiomedata` forbids access via a personal access token (classic). Please use a GitHub App, OAuth App, or a personal access token with fine-grained permissions.","documentation_url":"https://docs.github.com/rest/repos/repos#list-organization-repositories","status":"403"}
+
+# works: berkeleybop
+
+org_name = "microbiomedata"
+
+print("FETCHING REPOS")
+repos = get_repos(org_name, global_token)
+write_json_to_file(repos, f"{org_name}_repos.json")
+
+print("FETCHING ISSUES")
+org_issues = fetch_issues(org_name, global_token)
+sanitized_issues = process_issues(org_issues, DEFAULT_DROPPED_FIELDS)
+write_json_to_file(sanitized_issues, f"{org_name}_issues.json")
+
+print("FETCHING PRs")
+pull_requests = fetch_pull_requests(org_name, global_token)
+processed_pull_requests = process_pull_requests(pull_requests, DEFAULT_DROPPED_FIELDS)
+write_json_to_file(processed_pull_requests, f"{org_name}_prs.json")
+
+print("FETCHING COMMENTS")
+comments = fetch_all_comments(org_name, global_token)
+processed_comments = process_comments(comments, DEFAULT_DROPPED_FIELDS)
+write_json_to_file(processed_comments, f"{org_name}_comments.json")
+
+print("FETCHING DISCUSSIONS")
+all_discussions = fetch_all_discussions(org_name, global_token)
+processed_discussions = process_discussions(all_discussions, DEFAULT_DROPPED_FIELDS)
+print(f"Total discussions fetched from all repositories: {len(processed_discussions)}")
+write_json_to_file(processed_discussions, f"{org_name}_discussions.json")
diff --git a/llm_github/foo.py b/llm_github/foo.py
deleted file mode 100644
index 8b7396d..0000000
--- a/llm_github/foo.py
+++ /dev/null
@@ -1,17 +0,0 @@
-def foo(bar: str) -> str:
-    """Summary line.
-
-    Extended description of function.
-
-    Args:
-        bar: Description of input argument.
-
-    Returns:
-        Description of return value
-    """
-
-    return bar
-
-
-if __name__ == "__main__":  # pragma: no cover
-    pass
diff --git a/local/.env.template b/local/.env.template
index 74d0a43..3b926cd 100644
--- a/local/.env.template
+++ b/local/.env.template
@@ -1 +1 @@
-foo=bar
+GITHUB_TOKEN=
diff --git a/poetry.lock b/poetry.lock
index 1586a71..a940bf6 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -15,6 +15,25 @@ files = [
 six = ">=1.6.1,<2.0"
 wheel = ">=0.23.0,<1.0"
 
+[[package]]
+name = "attrs"
+version = "23.2.0"
+description = "Classes Without Boilerplate"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"},
+    {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"},
+]
+
+[package.extras]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"]
+tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"]
+
 [[package]]
 name = "babel"
 version = "2.15.0"
@@ -43,6 +62,31 @@ files = [
     {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"},
 ]
 
+[[package]]
+name = "cattrs"
+version = "23.2.3"
+description = "Composable complex class support for attrs and dataclasses."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "cattrs-23.2.3-py3-none-any.whl", hash = "sha256:0341994d94971052e9ee70662542699a3162ea1e0c62f7ce1b4a57f563685108"},
+    {file = "cattrs-23.2.3.tar.gz", hash = "sha256:a934090d95abaa9e911dac357e3a8699e0b4b14f8529bcc7d2b1ad9d51672b9f"},
+]
+
+[package.dependencies]
+attrs = ">=23.1.0"
+exceptiongroup = {version = ">=1.1.1", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=4.1.0,<4.6.3 || >4.6.3", markers = "python_version < \"3.11\""}
+
+[package.extras]
+bson = ["pymongo (>=4.4.0)"]
+cbor2 = ["cbor2 (>=5.4.6)"]
+msgpack = ["msgpack (>=1.0.5)"]
+orjson = ["orjson (>=3.9.2)"]
+pyyaml = ["pyyaml (>=6.0)"]
+tomlkit = ["tomlkit (>=0.11.8)"]
+ujson = ["ujson (>=5.7.0)"]
+
 [[package]]
 name = "certifi"
 version = "2024.6.2"
@@ -929,6 +973,20 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
+[[package]]
+name = "python-dotenv"
+version = "1.0.1"
+description = "Read key-value pairs from a .env file and set them as environment variables"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"},
+    {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"},
+]
+
+[package.extras]
+cli = ["click (>=5.0)"]
+
 [[package]]
 name = "pytz"
 version = "2024.1"
@@ -1123,6 +1181,36 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-cache"
+version = "1.2.1"
+description = "A persistent cache for python requests"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "requests_cache-1.2.1-py3-none-any.whl", hash = "sha256:1285151cddf5331067baa82598afe2d47c7495a1334bfe7a7d329b43e9fd3603"},
+    {file = "requests_cache-1.2.1.tar.gz", hash = "sha256:68abc986fdc5b8d0911318fbb5f7c80eebcd4d01bfacc6685ecf8876052511d1"},
+]
+
+[package.dependencies]
+attrs = ">=21.2"
+cattrs = ">=22.2"
+platformdirs = ">=2.5"
+requests = ">=2.22"
+url-normalize = ">=1.4"
+urllib3 = ">=1.25.5"
+
+[package.extras]
+all = ["boto3 (>=1.15)", "botocore (>=1.18)", "itsdangerous (>=2.0)", "pymongo (>=3)", "pyyaml (>=6.0.1)", "redis (>=3)", "ujson (>=5.4)"]
+bson = ["bson (>=0.5)"]
+docs = ["furo (>=2023.3,<2024.0)", "linkify-it-py (>=2.0,<3.0)", "myst-parser (>=1.0,<2.0)", "sphinx (>=5.0.2,<6.0.0)", "sphinx-autodoc-typehints (>=1.19)", "sphinx-automodapi (>=0.14)", "sphinx-copybutton (>=0.5)", "sphinx-design (>=0.2)", "sphinx-notfound-page (>=0.8)", "sphinxcontrib-apidoc (>=0.3)", "sphinxext-opengraph (>=0.9)"]
+dynamodb = ["boto3 (>=1.15)", "botocore (>=1.18)"]
+json = ["ujson (>=5.4)"]
+mongodb = ["pymongo (>=3)"]
+redis = ["redis (>=3)"]
+security = ["itsdangerous (>=2.0)"]
+yaml = ["pyyaml (>=6.0.1)"]
+
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -1183,6 +1271,20 @@ files = [
     {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
 ]
 
+[[package]]
+name = "url-normalize"
+version = "1.4.3"
+description = "URL normalization for Python"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+files = [
+    {file = "url-normalize-1.4.3.tar.gz", hash = "sha256:d23d3a070ac52a67b83a1c59a0e68f8608d1cd538783b401bc9de2c0fac999b2"},
+    {file = "url_normalize-1.4.3-py2.py3-none-any.whl", hash = "sha256:ec3c301f04e5bb676d333a7fa162fa977ad2ca04b7e652bfc9fac4e405728eed"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "urllib3"
 version = "2.2.2"
@@ -1296,4 +1398,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<4.0"
-content-hash = "a3b14f6f7cdd13dce3b5a26933ebce280805aba59061c1f199e1e5f2ad527883"
+content-hash = "d86f001781d611b808f7aabbf1e6125bfa210513a12386cabeadb7c5db8db447"
diff --git a/pyproject.toml b/pyproject.toml
index 09d370a..7077635 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,8 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.8,<4.0"
+python-dotenv = "^1.0.1"
+requests-cache = "^1.2.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.0"
diff --git a/tests/test_foo.py b/tests/test_foo.py
index 42c9b38..5ec9206 100644
--- a/tests/test_foo.py
+++ b/tests/test_foo.py
@@ -1,4 +1,4 @@
-from llm_github.foo import foo
+from llm_github.core import foo
 
 
 def test_foo():