diff --git a/README.md b/README.md index 0129213..8f2ce18 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ If using the Snyk Github Enterprise Integration type for your Github.com reposit 1. In GitHub.com browse: https://github.com/settings/tokens/new. Or in GitHub Enterprise select your user icon (top-right), then 'Settings', then 'Developer settings', then 'Personal access tokens'. 2. Scopes - Public repos do not need a scope. If you want to scan private repos, then you'll need to enable this scope: `repo` (Full control of private repositories) -## Handling self-signed certificates +### Handling self-signed certificates This tool uses the python requests library, therefore you can point [REQUESTS_CA_BUNDLE](https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification) environment variable to the location of your cert bundle `export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt` @@ -120,3 +120,19 @@ Use the `--dry-run` option to verify the execution plan for the first run | _updated-project-branches.csv | projects with updated default branch | | _update-project-branches-errors.csv | projects that had an error attempting to update default branch | | _repos-skipped-on-error.csv | repos skipped due to import error | + +## Handling of large repositories +The primary method used by this tool to retrieve the Git tree from each repository, as the basis of comparison, is the GitHub API. +For sufficiently large repositories, though, GitHub truncates the API response. When a truncated response is detected while retrieving the Git tree, +this tool will fall back to the local `git` (if available and configured) and perform a shallow clone of the repository's default branch in order to build the tree. + +It will use /tmp to perform the `git clone` and then capture the output of `git ls-tree -r`. + +When this situation occurs, you will see the following in the console: +``` +Large repo detected, falling back to cloning. 
This may take a few minutes ... +``` + +![image](https://user-images.githubusercontent.com/59706011/163878251-e874b073-eab6-48c0-9bd3-ea995005e4a9.png) + +The truncated GIT tree response is described [here](https://docs.github.com/en/rest/reference/git#get-a-tree). The last [known limits](https://github.community/t/github-get-tree-api-limits-and-recursivity/1300/2) are: 100,000 files or 7 MB of response data, whichever is first. diff --git a/app/app.py b/app/app.py index 682b7cc..46c55d7 100755 --- a/app/app.py +++ b/app/app.py @@ -135,7 +135,7 @@ def run(): app_print(snyk_repo.org_name, snyk_repo.full_name, - "Looking for new manifests in code repository") + "Checking for new manifests in source tree") #if not common.ARGS.dry_run: projects_import = snyk_repo.add_new_manifests(common.ARGS.dry_run) diff --git a/app/gh_repo.py b/app/gh_repo.py index 0b3ed5d..ac7b87d 100755 --- a/app/gh_repo.py +++ b/app/gh_repo.py @@ -1,19 +1,84 @@ """utilities for github""" import logging import re +import subprocess import requests from app.models import GithubRepoStatus import common + # suppess InsecureRequestWarning when using --skip-scm-validation option # due to pylint bug # https://github.com/PyCQA/pylint/issues/4584) # pylint: disable=no-member requests.packages.urllib3.disable_warnings() +# pylint: disable=invalid-name +state = { + "tree_already_retrieved": False, + "manifests": [] +} + +def get_git_tree_from_clone(gh_repo): + """ + get git tree for large repos by performing + a shallow clone 'git clone --depth 1' + """ + + tree_full_paths = [] + + # check if git exists on the system + subprocess.run(["command", "-v", "git"], check=True, stdout=subprocess.DEVNULL) + + name = gh_repo.name + clone_url = gh_repo.clone_url + default_branch = gh_repo.default_branch + + print(f" - shallow cloning {name} from {clone_url} to /tmp") + + # clone the repo locally + subprocess.run(["rm", "-fr", f"{common.GIT_CLONE_TEMP_DIR}/{name}"], check=True) + subprocess.run( + ["git", "clone", 
"--depth", "1", clone_url], + check=True, + cwd=common.GIT_CLONE_TEMP_DIR + ) + + print(" - Loading tree from local git structure") + + git_tree = subprocess.run( + [ + "git", + "ls-tree", + "-r", + default_branch + ], + capture_output=True, + check=True, + text=True, + cwd=f"{common.GIT_CLONE_TEMP_DIR}/{name}" + ) + + git_tree_lines = git_tree.stdout.splitlines() + print(f" - found {len(git_tree_lines)} tree items ...") + + for line in git_tree_lines: + sha, path = [line.split()[i] for i in (2, 3)] + tree_full_paths.append({ + "sha": sha, + "path": path + }) + + return tree_full_paths + def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code): """retrieve list of all supported manifests in a given github repo""" - manifests = [] + + if state['tree_already_retrieved']: + state['tree_already_retrieved'] = False + return state['manifests'] + + state['manifests'] = [] try: if origin == 'github': gh_repo = common.gh_client.get_repo(snyk_repo_name) @@ -22,20 +87,44 @@ def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code): # pylint: disable=bare-except except: if origin == 'github': - gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name) + gh_repo = common.gh_client.get_user().get_repo(snyk_repo_name) elif origin == 'github-enterprise': gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name) - contents = gh_repo.get_git_tree(gh_repo.default_branch, True).tree + tree_response = gh_repo.get_git_tree(gh_repo.default_branch, True) + contents = tree_response.tree + + #pylint: disable=protected-access + is_truncated_str = tree_response._rawData['truncated'] + + if is_truncated_str: + # repo too large to get try via API, just clone it + print(f" - Large repo detected, falling back to cloning. 
" + f"This may take a few minutes ...") + contents = get_git_tree_from_clone(gh_repo) + # print(f"tree contents: {contents}") while contents: - file_content = contents.pop(0) - if passes_manifest_filter(file_content.path, skip_snyk_code): - manifests.append(file_content.path) - if re.match(common.MANIFEST_PATTERN_CODE, file_content.path): + tree_element = contents.pop(0) + # print(f"tree_element: {tree_element}") + if is_truncated_str: + tree_element_sha = tree_element['sha'] + tree_element_path = tree_element['path'] + else: + tree_element_sha = tree_element.sha + tree_element_path = tree_element.path + full_path = { + "sha": tree_element_sha, + "path": tree_element_path + } + if passes_manifest_filter(full_path['path'], skip_snyk_code): + #print(f"appending to manifests to check") + state['manifests'].append(full_path['path']) + if re.match(common.MANIFEST_PATTERN_CODE, full_path['path']): skip_snyk_code = True - #print(manifests) - return manifests + + state['tree_already_retrieved'] = True + return state['manifests'] def passes_manifest_filter(path, skip_snyk_code=False): """ check if given path should be imported based diff --git a/app/snyk_repo.py b/app/snyk_repo.py index 35df3ed..e42c432 100755 --- a/app/snyk_repo.py +++ b/app/snyk_repo.py @@ -68,6 +68,7 @@ def add_new_manifests(self, dry_run): gh_repo_manifests = get_repo_manifests(self.full_name, self.origin, self.has_snyk_code()) for gh_repo_manifest in gh_repo_manifests: + #print(f"checking to import: {gh_repo_manifest}") if gh_repo_manifest not in {sp['manifest'] for sp in self.snyk_projects}: files.append(dict({"path": gh_repo_manifest})) diff --git a/common.py b/common.py index 025f71a..419131a 100644 --- a/common.py +++ b/common.py @@ -27,6 +27,8 @@ GITHUB_ENTERPRISE_TOKEN = getenv("GITHUB_ENTERPRISE_TOKEN") GITHUB_ENTERPRISE_HOST = getenv("GITHUB_ENTERPRISE_HOST") +GIT_CLONE_TEMP_DIR = "/tmp" + LOG_PREFIX = "snyk-scm-refresh" LOG_FILENAME = LOG_PREFIX + ".log" POTENTIAL_DELETES_FILE = 
open("%s_potential-repo-deletes.csv" % LOG_PREFIX, "w")