From 41f62d749ad45c626cdec24e44290bb276b067a3 Mon Sep 17 00:00:00 2001 From: Scott Esbrandt Date: Fri, 22 Apr 2022 00:35:59 -0400 Subject: [PATCH] feat: audit large repos mode --- README.md | 9 ++++++ app/app.py | 20 +++++++++++-- app/gh_repo.py | 62 ++++++++++++++++++++++++++------------- app/utils/github_utils.py | 20 +++++++++++++ app/utils/snyk_helper.py | 7 +++++ common.py | 11 +++++++ snyk_scm_refresh.py | 5 +++- 7 files changed, 111 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8f2ce18..454111e 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ optional arguments: --dry-run Simulate processing of the script without making changes to Snyk --skip-scm-validation Skip validation of the TLS certificate used by the SCM + --audit-large-repos only query github tree api to see if the response is truncated and + log the result. These are the repos that would have be cloned via this tool --debug Write detailed debug data to snyk_scm_refresh.log for troubleshooting ``` @@ -136,3 +138,10 @@ Large repo detected, falling back to cloning. This may take a few minutes ... ![image](https://user-images.githubusercontent.com/59706011/163878251-e874b073-eab6-48c0-9bd3-ea995005e4a9.png) The truncated GIT tree response is described [here](https://docs.github.com/en/rest/reference/git#get-a-tree). The last [known limits](https://github.community/t/github-get-tree-api-limits-and-recursivity/1300/2) are: 100,000 files or 7 MB of response data, whichever is first. + +### Auditing which repos are considered large +In order to detect which repositories in Snyk are subject to the tree truncation issue mentioned above, there is another available option `--audit-large-repos`. 
+This will only query the git tree via API and look for a truncated response, and then log the results to a file `snyk-scm-refresh_large-repos-audit-results.csv` + +To find all the repos based on a Snyk org, use the `--org-id` parameter in conjunction with `--audit-large-repos` +Optionally you can also supply a repo name to check a single repo by also supplying the `--repo-name` filter. diff --git a/app/app.py b/app/app.py index 46c55d7..1d5b39b 100755 --- a/app/app.py +++ b/app/app.py @@ -9,14 +9,17 @@ from app.models import ImportStatus from app.gh_repo import ( get_gh_repo_status, - is_default_branch_renamed + is_default_branch_renamed, + is_gh_repo_truncated, + get_git_tree_from_api ) from app.utils.snyk_helper import ( get_snyk_repos_from_snyk_orgs, app_print, process_import_status_checks, import_manifests, - log_potential_delete + log_potential_delete, + log_audit_large_repo_result ) def run(): @@ -75,6 +78,19 @@ def run(): log_potential_delete(snyk_repo.org_name, snyk_repo.full_name) elif gh_repo_status.response_code == 200: # project exists and has not been renamed + # if --audit-large-repos is on + if common.ARGS.audit_large_repos: + is_truncated_str = \ + is_gh_repo_truncated( + get_git_tree_from_api(snyk_repo.full_name, snyk_repo.origin) + ) + log_audit_large_repo_result( + snyk_repo.org_name, + snyk_repo.full_name, + str(bool(is_truncated_str)) + ) + # move to next repo without processing the rest of the code + continue # snyk has the wrong branch, re-import if gh_repo_status.repo_default_branch != snyk_repo.branch: app_print(snyk_repo.org_name, diff --git a/app/gh_repo.py b/app/gh_repo.py index ac7b87d..2117700 100755 --- a/app/gh_repo.py +++ b/app/gh_repo.py @@ -1,9 +1,14 @@ """utilities for github""" import logging import re +import sys import subprocess import requests from app.models import GithubRepoStatus +from app.utils.github_utils import ( + get_github_client, + get_github_repo +) import common @@ -19,7 +24,7 @@ "manifests": [] } -def 
get_git_tree_from_clone(gh_repo): +def get_git_tree_from_clone(repo_name, origin): """ get git tree for large repos by performing a shallow clone 'git clone --depth 1' @@ -27,6 +32,9 @@ def get_git_tree_from_clone(gh_repo): tree_full_paths = [] + gh_client = get_github_client(origin) + gh_repo = get_github_repo(gh_client, repo_name) + # check if git exists on the system subprocess.run(["command", "-v", "git"], check=True, stdout=subprocess.DEVNULL) @@ -34,10 +42,20 @@ def get_git_tree_from_clone(gh_repo): clone_url = gh_repo.clone_url default_branch = gh_repo.default_branch - print(f" - shallow cloning {name} from {clone_url} to /tmp") + GIT_CLONE_PATH = f"{common.GIT_CLONE_TEMP_DIR}/{name}" + + # check that GIT_CLONE_PATH is set safely for deletion + if re.match(f'{common.GIT_CLONE_TEMP_DIR}/.+', GIT_CLONE_PATH) and \ + re.match(rf'\/.+\/.+', GIT_CLONE_PATH): + pass + else: + sys.exit(f"could not determine that the temp cloning directory" + f"{GIT_CLONE_PATH} was set properly, exiting...") + + print(f" - shallow cloning {name} from {clone_url} to {GIT_CLONE_PATH}") # clone the repo locally - subprocess.run(["rm", "-fr", f"{common.GIT_CLONE_TEMP_DIR}/{name}"], check=True) + subprocess.run(["rm", "-fr", f"{GIT_CLONE_PATH}"], check=True) subprocess.run( ["git", "clone", "--depth", "1", clone_url], check=True, @@ -56,9 +74,12 @@ def get_git_tree_from_clone(gh_repo): capture_output=True, check=True, text=True, - cwd=f"{common.GIT_CLONE_TEMP_DIR}/{name}" + cwd=f"{GIT_CLONE_PATH}" ) + print(f" - removing cloned files in /tmp...") + subprocess.run(["rm", "-fr", f"{GIT_CLONE_PATH}"], check=True) + git_tree_lines = git_tree.stdout.splitlines() print(f" - found {len(git_tree_lines)} tree items ...") @@ -71,6 +92,18 @@ def get_git_tree_from_clone(gh_repo): return tree_full_paths +def is_gh_repo_truncated(gh_tree_response) -> bool: + """ check if repo is truncated """ + #pylint: disable=protected-access + return gh_tree_response._rawData['truncated'] + +def 
get_git_tree_from_api(repo_name, origin): + """ get git tree for repo via API call """ + gh_client = get_github_client(origin) + gh_repo = get_github_repo(gh_client, repo_name) + + return gh_repo.get_git_tree(gh_repo.default_branch, True) + def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code): """retrieve list of all supported manifests in a given github repo""" @@ -79,29 +112,18 @@ def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code): return state['manifests'] state['manifests'] = [] - try: - if origin == 'github': - gh_repo = common.gh_client.get_repo(snyk_repo_name) - elif origin == 'github-enterprise': - gh_repo = common.gh_enterprise_client.get_repo(snyk_repo_name) - # pylint: disable=bare-except - except: - if origin == 'github': - gh_repo = common.gh_client.get_user().get_repo(snyk_repo_name) - elif origin == 'github-enterprise': - gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name) - - tree_response = gh_repo.get_git_tree(gh_repo.default_branch, True) + + tree_response = get_git_tree_from_api(snyk_repo_name, origin) + contents = tree_response.tree - #pylint: disable=protected-access - is_truncated_str = tree_response._rawData['truncated'] + is_truncated_str = is_gh_repo_truncated(tree_response) if is_truncated_str: # repo too large to get try via API, just clone it print(f" - Large repo detected, falling back to cloning. 
" f"This may take a few minutes ...") - contents = get_git_tree_from_clone(gh_repo) + contents = get_git_tree_from_clone(snyk_repo_name, origin) # print(f"tree contents: {contents}") while contents: diff --git a/app/utils/github_utils.py b/app/utils/github_utils.py index dc08a91..63fef62 100644 --- a/app/utils/github_utils.py +++ b/app/utils/github_utils.py @@ -3,6 +3,7 @@ github enterprise clients """ from github import Github +import common # pylint: disable=invalid-name def create_github_client(GITHUB_TOKEN, VERIFY_TLS): @@ -23,3 +24,22 @@ def create_github_enterprise_client(GITHUB_ENTERPRISE_TOKEN, GITHUB_ENTERPRISE_H raise RuntimeError( "Failed to initialize GitHub client because GITHUB_ENTERPRISE_TOKEN is not set!" ) from err + +def get_github_client(origin): + """ get the right github client depending on intergration type """ + #pylint: disable=no-else-return + if origin == 'github': + return common.gh_client + elif origin == 'github-enterprise': + return common.gh_enterprise_client + else: + raise Exception(f"could not get github client for type: {origin}") + +def get_github_repo(gh_client, repo_name): + """ get a github repo by name """ + try: + return gh_client.get_repo(repo_name) + # pylint: disable=bare-except + except: + return gh_client.get_user().get_repo(repo_name) + \ No newline at end of file diff --git a/app/utils/snyk_helper.py b/app/utils/snyk_helper.py index 211fc09..1a1c266 100644 --- a/app/utils/snyk_helper.py +++ b/app/utils/snyk_helper.py @@ -32,6 +32,13 @@ def log_update_project_branch_error(org_name, project_id, project_name, new_bran f"{project_id}," f"{new_branch}\n") +def log_audit_large_repo_result(org_name: str, repo_name: str, is_large: str): + """ Log audit large repo result """ + common.LARGE_REPOS_AUDIT_RESULTS_FILE.write( + f"{org_name}," + f"{repo_name}," + f"{is_large}\n") + def get_snyk_repos_from_snyk_orgs(snyk_orgs, ARGS): """Build list of repositories from a given list of Snyk orgs""" snyk_repos = [] diff --git 
a/common.py b/common.py index 419131a..7e8192b 100644 --- a/common.py +++ b/common.py @@ -61,6 +61,10 @@ "%s_update-project-branches-errors.csv" % LOG_PREFIX, "w" ) UPDATE_PROJECT_BRANCHES_ERRORS_FILE.write("org,project_name,project_id,new_branch\n") +LARGE_REPOS_AUDIT_RESULTS_FILE = open( + "%s_large-repos-audit-results.csv" % LOG_PREFIX, "w" +) +LARGE_REPOS_AUDIT_RESULTS_FILE.write("org,repo,is_large\n") PENDING_REMOVAL_MAX_CHECKS = 45 PENDING_REMOVAL_CHECK_INTERVAL = 20 @@ -123,6 +127,13 @@ def parse_command_line_args(): required=False, action="store_true", ) + parser.add_argument( + "--audit-large-repos", + help="only query github tree api to see if the response is truncated and \ + log the result. These are the repos that would have be cloned via this tool", + required=False, + action="store_true", + ) parser.add_argument( "--debug", help="Write detailed debug data to snyk_scm_refresh.log for troubleshooting", diff --git a/snyk_scm_refresh.py b/snyk_scm_refresh.py index c54236e..7aa5951 100755 --- a/snyk_scm_refresh.py +++ b/snyk_scm_refresh.py @@ -10,7 +10,10 @@ if __name__ == "__main__": - if common.ARGS.dry_run: + if common.ARGS.audit_large_repos: + print("\n****** AUDIT LARGE REPOS MODE ******\n") + print(f"check {common.LARGE_REPOS_AUDIT_RESULTS_FILE.name} after script completes\n") + elif common.ARGS.dry_run: print("\n****** DRY-RUN MODE ******\n") for arg in vars(common.ARGS): if any(arg in x for x in ['sca', 'container', 'iac', 'code']):