Skip to content
This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

Commit

Permalink
Merge pull request #91 from snyk-tech-services/develop
Browse files Browse the repository at this point in the history
release changes
  • Loading branch information
scott-es authored Apr 18, 2022
2 parents 8f6a71b + f22578b commit 4ecefbf
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 11 deletions.
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ If using the Snyk Github Enterprise Integration type for your Github.com reposit
1. In GitHub.com browse: https://github.com/settings/tokens/new. Or in GitHub Enterprise select your user icon (top-right), then 'Settings', then 'Developer settings', then 'Personal access tokens'.
2. Scopes - Public repos do not need a scope. If you want to scan private repos, then you'll need to enable this scope: `repo` (Full control of private repositories)

## Handling self-signed certificates
### Handling self-signed certificates
This tool uses the Python `requests` library, so you can point the [REQUESTS_CA_BUNDLE](https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification) environment variable at the location of your certificate bundle:

`export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt`
Expand Down Expand Up @@ -120,3 +120,19 @@ Use the `--dry-run` option to verify the execution plan for the first run
| _updated-project-branches.csv | projects with updated default branch |
| _update-project-branches-errors.csv | projects that had an error attempting to update default branch |
| _repos-skipped-on-error.csv | repos skipped due to import error |

## Handling of large repositories
By default, this tool retrieves each repository's Git tree through the GitHub API and uses it as the basis for comparison.
For sufficiently large repositories, however, GitHub truncates the API response. When a truncated response is detected while retrieving the Git tree,
this tool falls back to the local `git` client (if it is installed and configured) and performs a shallow clone of the repository's default branch in order to build the tree.

It runs `git clone` inside `/tmp` and then captures the output of `git ls-tree -r`.

When this situation occurs, you will see the following in the console:
```
Large repo detected, falling back to cloning. This may take a few minutes ...
```

![image](https://user-images.githubusercontent.com/59706011/163878251-e874b073-eab6-48c0-9bd3-ea995005e4a9.png)

The truncated GIT tree response is described [here](https://docs.github.com/en/rest/reference/git#get-a-tree). The last [known limits](https://github.community/t/github-get-tree-api-limits-and-recursivity/1300/2) are: 100,000 files or 7 MB of response data, whichever is first.
2 changes: 1 addition & 1 deletion app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def run():

app_print(snyk_repo.org_name,
snyk_repo.full_name,
"Looking for new manifests in code repository")
"Checking for new manifests in source tree")

#if not common.ARGS.dry_run:
projects_import = snyk_repo.add_new_manifests(common.ARGS.dry_run)
Expand Down
107 changes: 98 additions & 9 deletions app/gh_repo.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,84 @@
"""utilities for github"""
import logging
import re
import shutil
import subprocess

import requests

from app.models import GithubRepoStatus
import common


# suppress InsecureRequestWarning when using --skip-scm-validation option
# (no-member is disabled below because of a pylint bug:
# https://github.com/PyCQA/pylint/issues/4584)
# pylint: disable=no-member
requests.packages.urllib3.disable_warnings()

# Module-level cache shared across calls to get_repo_manifests():
#   tree_already_retrieved - set to True once a tree has been scanned; the
#       next call sees the flag, resets it to False and returns the cached
#       list instead of fetching the tree again
#   manifests - manifest paths collected by the most recent scan
# NOTE(review): the cache is not keyed by repository name, so it presumably
# relies on callers asking for the same repo twice in a row - confirm.
# pylint: disable=invalid-name
state = {
"tree_already_retrieved": False,
"manifests": []
}

def _parse_ls_tree_output(ls_tree_output):
    """Convert NUL-delimited ``git ls-tree -r -z`` output into tree entries.

    Each record has the form ``<mode> SP <type> SP <sha> TAB <path>``. The
    path is everything after the first TAB, so file names that contain
    whitespace are kept intact.

    Returns a list of ``{"sha": ..., "path": ...}`` dicts.
    """
    entries = []
    for record in ls_tree_output.split("\0"):
        metadata, tab, path = record.partition("\t")
        fields = metadata.split()
        if not tab or len(fields) < 3:
            # the empty record after the final NUL, or anything malformed
            continue
        entries.append({"sha": fields[2], "path": path})
    return entries


def get_git_tree_from_clone(gh_repo):
    """
    get git tree for large repos by performing
    a shallow clone 'git clone --depth 1'

    The repository is cloned into common.GIT_CLONE_TEMP_DIR, the tree of its
    default branch is listed with 'git ls-tree -r -z', and the clone is
    removed again before returning.

    gh_repo must expose ``name``, ``clone_url`` and ``default_branch``.

    Returns a list of ``{"sha": ..., "path": ...}`` dicts, one per tree item.

    Raises FileNotFoundError when git is not installed and
    subprocess.CalledProcessError when a git command fails.
    """
    # 'command -v' is a shell builtin and cannot be executed without a shell,
    # so look the executable up on PATH directly
    if shutil.which("git") is None:
        raise FileNotFoundError(
            "git executable not found on PATH; it is required to clone large repos"
        )

    name = gh_repo.name
    clone_url = gh_repo.clone_url
    default_branch = gh_repo.default_branch
    clone_dir = f"{common.GIT_CLONE_TEMP_DIR}/{name}"

    print(f" - shallow cloning {name} from {clone_url} to {common.GIT_CLONE_TEMP_DIR}")

    # remove any stale clone left behind by an interrupted run
    shutil.rmtree(clone_dir, ignore_errors=True)

    try:
        # clone the repo locally; the destination is passed explicitly so it
        # always matches clone_dir regardless of how the URL is spelled
        subprocess.run(
            ["git", "clone", "--depth", "1", "--", clone_url, name],
            check=True,
            cwd=common.GIT_CLONE_TEMP_DIR
        )

        print(" - Loading tree from local git structure")

        # -z gives NUL-terminated records with unquoted paths, so names with
        # spaces or non-ASCII characters come back verbatim
        git_tree = subprocess.run(
            ["git", "ls-tree", "-r", "-z", default_branch],
            capture_output=True,
            check=True,
            encoding="utf-8",
            errors="replace",
            cwd=clone_dir
        )
    finally:
        # do not leave (potentially very large) clones behind in the temp dir
        shutil.rmtree(clone_dir, ignore_errors=True)

    tree_full_paths = _parse_ls_tree_output(git_tree.stdout)
    print(f" - found {len(tree_full_paths)} tree items ...")

    return tree_full_paths

def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
"""retrieve list of all supported manifests in a given github repo"""
manifests = []

if state['tree_already_retrieved']:
state['tree_already_retrieved'] = False
return state['manifests']

state['manifests'] = []
try:
if origin == 'github':
gh_repo = common.gh_client.get_repo(snyk_repo_name)
Expand All @@ -22,20 +87,44 @@ def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
# pylint: disable=bare-except
except:
if origin == 'github':
gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name)
gh_repo = common.gh_client.get_user().get_repo(snyk_repo_name)
elif origin == 'github-enterprise':
gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name)

contents = gh_repo.get_git_tree(gh_repo.default_branch, True).tree
tree_response = gh_repo.get_git_tree(gh_repo.default_branch, True)
contents = tree_response.tree

#pylint: disable=protected-access
is_truncated_str = tree_response._rawData['truncated']

if is_truncated_str:
# repo too large to get tree via API, just clone it
print(f" - Large repo detected, falling back to cloning. "
f"This may take a few minutes ...")
contents = get_git_tree_from_clone(gh_repo)
# print(f"tree contents: {contents}")

while contents:
file_content = contents.pop(0)
if passes_manifest_filter(file_content.path, skip_snyk_code):
manifests.append(file_content.path)
if re.match(common.MANIFEST_PATTERN_CODE, file_content.path):
tree_element = contents.pop(0)
# print(f"tree_element: {tree_element}")
if is_truncated_str:
tree_element_sha = tree_element['sha']
tree_element_path = tree_element['path']
else:
tree_element_sha = tree_element.sha
tree_element_path = tree_element.path
full_path = {
"sha": tree_element_sha,
"path": tree_element_path
}
if passes_manifest_filter(full_path['path'], skip_snyk_code):
#print(f"appending to manifests to check")
state['manifests'].append(full_path['path'])
if re.match(common.MANIFEST_PATTERN_CODE, full_path['path']):
skip_snyk_code = True
#print(manifests)
return manifests

state['tree_already_retrieved'] = True
return state['manifests']

def passes_manifest_filter(path, skip_snyk_code=False):
""" check if given path should be imported based
Expand Down
1 change: 1 addition & 0 deletions app/snyk_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def add_new_manifests(self, dry_run):
gh_repo_manifests = get_repo_manifests(self.full_name, self.origin, self.has_snyk_code())

for gh_repo_manifest in gh_repo_manifests:
#print(f"checking to import: {gh_repo_manifest}")
if gh_repo_manifest not in {sp['manifest'] for sp in self.snyk_projects}:
files.append(dict({"path": gh_repo_manifest}))

Expand Down
2 changes: 2 additions & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
GITHUB_ENTERPRISE_TOKEN = getenv("GITHUB_ENTERPRISE_TOKEN")
GITHUB_ENTERPRISE_HOST = getenv("GITHUB_ENTERPRISE_HOST")

GIT_CLONE_TEMP_DIR = "/tmp"

LOG_PREFIX = "snyk-scm-refresh"
LOG_FILENAME = LOG_PREFIX + ".log"
POTENTIAL_DELETES_FILE = open("%s_potential-repo-deletes.csv" % LOG_PREFIX, "w")
Expand Down

0 comments on commit 4ecefbf

Please sign in to comment.