Skip to content

Commit

Permalink
Prevent checks from downloading whole repos
Browse files Browse the repository at this point in the history
Repo versions are computed from manifests with updated revisions for each project.
The repo tool can only generate a manifest with accurate sha1 hashes once all projects
have been downloaded locally (repo manifest -r -o new-manifest.xml).
This uses a lot of network bandwidth, memory, CPU and disk space and is a huge waste
of time.
git ls-remote is used instead to fetch revisions much quicker before being injected
into the original manifest. A new variable check_jobs is used to spawn concurrent
processes to make it close to X times faster.
Adjust the jobs and check_jobs variables based on the git server's capabilities/limitations.

Signed-off-by: David Rozé <[email protected]>
  • Loading branch information
david-baylibre committed Feb 7, 2024
1 parent 5237dbb commit d9902d4
Show file tree
Hide file tree
Showing 7 changed files with 267 additions and 78 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ Track changes in a [repo](https://gerrit.googlesource.com/git-repo/+/master/#rep
* `jobs`: *Optional.* number of jobs to run in parallel (default: 0; based on number of CPU cores)
Reduce this if you observe network errors.

* `check_jobs`: *Optional.* number of jobs to run in parallel in the check step (default: jobs\*2,
or 2 if jobs is undefined).

### Example

Resource configuration for a public project using repo (Android)
Expand Down
24 changes: 17 additions & 7 deletions repo_resource/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

from repo_resource import common

DEFAULT_CHECK_JOBS = 2


def check(instream) -> list:
"""Checks a json formatted IOstream for new versions
Expand All @@ -33,13 +35,23 @@ def check(instream) -> list:

config = common.source_config_from_payload(payload)

standard_versions = []
for v in payload.get('versions', []):
standard_versions.append(common.Version(v['version']).standard())

if config.private_key != '_invalid':
common.add_private_key_to_agent(config.private_key)

jobs = config.jobs
check_jobs = config.check_jobs or jobs*2 or DEFAULT_CHECK_JOBS

try:
repo = common.Repo()
repo.init(config.url, config.revision, config.name, config.depth)
repo.sync(jobs=config.jobs)
repo = common.Repo(config.url,
config.revision,
config.name,
config.depth)
repo.init()
repo.update_manifest(jobs=check_jobs)
version = repo.currentVersion()
except Exception as e:
raise e
Expand All @@ -48,11 +60,9 @@ def check(instream) -> list:
if config.private_key != '_invalid':
common.remove_private_key_from_agent()

new_version = {'version': str(version)}

versions = payload.get('versions', [])
if versions.count(new_version) == 0:
versions.append(new_version)
if version.standard() not in standard_versions:
versions.append({'version': str(version)})

return versions

Expand Down
212 changes: 183 additions & 29 deletions repo_resource/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,33 @@
import sys
import tempfile
import warnings
import git
import re
import xml.etree.ElementTree as ET

from contextlib import redirect_stdout
from pathlib import Path
from typing import NamedTuple
from urllib.parse import urlparse
from multiprocessing import Pool

import ssh_agent_setup
from repo import manifest_xml
from repo import main as repo


# Default working directory used by Repo (the default of its workdir argument).
CACHEDIR = Path('/tmp/repo-resource-cache')
# Matches a string made of exactly 40 lowercase hexadecimal characters;
# is_sha1() uses it to recognise revisions that are already a commit sha1.
SHA1_PATTERN = re.compile(r'^[0-9a-f]{40}$')
# Attributes excluded from the canonical XML produced by Version.standard().
EXCLUDE_ATTRS = {'dest-branch', 'upstream'}
# Fallback number of parallel processes for Repo.update_manifest() when the
# caller passes a falsy jobs value.
DEFAULT_CHECK_JOBS = 2
# Manifest child tags kept by Version.standard(); the position in this list
# is also the primary sort key used when the children are reordered.
TAGS = [
    'remote',
    'default',
    'project',
    'remove-project',
    'superproject',
    'contactinfo'
]


def add_private_key_to_agent(private_key: str):
Expand Down Expand Up @@ -56,6 +72,42 @@ def remove_private_key_from_agent():
atexit.unregister(ssh_agent_setup._kill_agent)


def is_sha1(s):
    """
    Tell whether s looks like a full commit sha1.

    Returns the match object produced by SHA1_PATTERN when s matches,
    None otherwise, so the result is meant to be used for its truthiness.
    """
    return SHA1_PATTERN.match(s)


def multi_run_wrapper(args):
    """
    Adapter for Pool.map, which hands each task a single argument.

    args is one (remote, remoteUrl, project, branch) tuple; it is unpacked
    into the positional parameters of getRevision and that call's result
    is returned unchanged.
    """
    result = getRevision(*args)
    return result


def getRevision(remote, remoteUrl, project, branch):
    """
    Get latest commit sha1 for revision
    with git ls-remote command
    without downloading the whole repo

    remote is the manifest remote name, remoteUrl its fetch url, project
    the project name and branch the revision to resolve (a ref name or a
    sha1).

    Returns a (remote/project, revision) tuple. A branch that is already a
    sha1 is returned as is, without contacting the server. Returns None
    when the revision cannot be fetched; the failure is logged to stderr.
    """
    # v1.0^{} is the commit referring to tag v1.0
    # git ls-remote returns the tag sha1 if left as is
    if branch.startswith('refs/tags'):
        branch += '^{}'
    try:
        with redirect_stdout(sys.stderr):
            # return tuple (remote/project, revision)
            print('Fetching revision for '+remote+'/'+project+'...')
            if is_sha1(branch):
                return (remote + '/' + project, branch)
            g = git.cmd.Git()
            url, revision = (
                remote + '/' + project,
                g.ls_remote(remoteUrl+'/'+project, branch).split()[0]
            )
            print(url+': '+revision)
            return (url, revision)
    except Exception as e:
        # The redirect_stdout context above has already been exited when
        # this handler runs, so write to stderr explicitly: stdout must
        # stay free of log output.
        print('Cannot fetch project ', remoteUrl+'/'+project,
              file=sys.stderr)
        print(e, file=sys.stderr)
        return None


class SourceConfiguration(NamedTuple):
"""
Supported source configuration items when configuring
Expand All @@ -67,6 +119,7 @@ class SourceConfiguration(NamedTuple):
private_key: str = '_invalid'
depth: int = -1
jobs: int = 0
check_jobs: int = 0


def source_config_from_payload(payload):
Expand All @@ -76,8 +129,10 @@ def source_config_from_payload(payload):
p = SourceConfiguration(**payload['source'])
source_url = urlparse(p.url)

if source_url.netloc == 'gitlab.com' and \
(source_url.scheme == 'http' or source_url.scheme == 'https'):
if (
source_url.netloc == 'gitlab.com' and
re.fullmatch('https?', source_url.scheme)
):
if not source_url.path.endswith('.git'):
raise RuntimeError('gitlab http(s) urls must end with .git')

Expand Down Expand Up @@ -105,6 +160,28 @@ def to_file(self, filename):
def metadata(self) -> str:
return ''

def standard(self) -> str:
    """
    Return a normalized XML string of this version, meant for comparing
    two versions regardless of element order or ignored attributes.

    Direct children of the root whose tag is not listed in TAGS are
    dropped. The remaining children are sorted by the position of their
    tag in TAGS, then by their 'name' attribute, and attached to a fresh
    <manifest> element. The result is canonicalized with text stripped
    and the EXCLUDE_ATTRS attributes removed.

    Raises xml.etree.ElementTree.ParseError when the version is not
    valid XML.
    """
    try:
        root = ET.fromstring(self.__version)
        kept = []
        # Collect the elements to keep instead of calling root.remove()
        # while iterating over root: removing during iteration skips the
        # element that follows each removed one.
        for element in root:
            if element.tag in TAGS:
                kept.append(element)
            else:
                # Logs go to stderr; stdout must stay free of log output.
                print("Removed ", element.tag, file=sys.stderr)
        # list.sort is stable: elements with the same tag and name keep
        # their original relative order.
        kept.sort(key=lambda x: (TAGS.index(x.tag), x.get('name') or ""))
        manifest = ET.Element('manifest')
        manifest.extend(kept)
        return ET.canonicalize(
            ET.tostring(manifest),
            strip_text=True,
            exclude_attrs=EXCLUDE_ATTRS
        )
    except ET.ParseError:
        print('Version is not valid xml', file=sys.stderr)
        raise

def __repr__(self) -> str:
return self.__version

Expand All @@ -121,9 +198,16 @@ class Repo:
such as init/sync and manifest
"""

def __init__(self, workdir=CACHEDIR):
def __init__(self, url, revision='HEAD', name='default.xml',
depth=-1, workdir=CACHEDIR):
self.__workdir = workdir
self.__oldpwd = None
self.__url = url
self.__revision = revision
self.__name = name
self.__depth = depth
self.__version: Version = None
self.__remote = {}
workdir.mkdir(parents=True, exist_ok=True)

# gitrepo from https://github.com/grouperenault/gitrepo
Expand All @@ -145,25 +229,33 @@ def __change_to_workdir(self):
def __restore_oldpwd(self):
os.chdir(self.__oldpwd)

def init(self, url, revision='HEAD', name='default.xml', depth=-1):
def __add_remote(self, remote, url):
    """Record url as the fetch url of the remote named remote,
    replacing any url previously stored under that name."""
    self.__remote.update({remote: url})

def __remote_url(self, remote):
    """Return the fetch url stored for the remote named remote.

    Raises KeyError when no such remote has been recorded.
    """
    known_remotes = self.__remote
    return known_remotes[remote]

def init(self):
self.__change_to_workdir()
try:
# Google's repo prints a lot of information to stdout.
# Concourse expects every logs to be emitted to stderr:
# https://concourse-ci.org/implementing-resource-types.html#implementing-resource-types
# https://concourse-ci.org/implementing-resource-types.html#implementing-resource-types # noqa: E501
with redirect_stdout(sys.stderr):
repo_cmd = [
'--no-pager', 'init', '--quiet', '--manifest-url', url,
'--manifest-name', name,
'--no-tags',
'--no-pager', 'init', '--quiet', '--manifest-url',
self.__url, '--manifest-name',
self.__name, '--no-tags',
]
if depth > 0:
repo_cmd.append('--depth={}'.format(depth))
if self.__depth > 0:
repo_cmd.append('--depth={}'.format(self.__depth))

if revision is not None:
repo_cmd.append('--manifest-branch={}'.format(revision))
if self.__revision is not None:
repo_cmd.append(
'--manifest-branch={}'.format(self.__revision)
)

print('Downloading manifest from {}'.format(url))
print('Downloading manifest from {}'.format(self.__url))
repo._Main(repo_cmd)
print('repo has been initialized in {}'.format(self.__workdir))

Expand All @@ -172,7 +264,7 @@ def init(self, url, revision='HEAD', name='default.xml', depth=-1):
finally:
self.__restore_oldpwd()

def sync(self, version: Version = None, jobs: int = 0):
def sync(self, version: Version, jobs: int = 0):
self.__change_to_workdir()
try:
with redirect_stdout(sys.stderr):
Expand All @@ -185,34 +277,33 @@ def sync(self, version: Version = None, jobs: int = 0):
if jobs > 0:
repo_cmd.append('--jobs={}'.format(jobs))

if version is None:
with tempfile.TemporaryDirectory() as tmpdir:
tmp_manifest = os.path.join(tmpdir, 'manifest_tmp')
version.to_file(tmp_manifest)
repo_cmd.append(
'--manifest-name={}'.format(tmp_manifest))
repo._Main(repo_cmd)
else:
with tempfile.TemporaryDirectory() as tmpdir:
tmp_manifest = os.path.join(tmpdir, 'manifest_tmp')
version.to_file(tmp_manifest)
repo_cmd.append(
'--manifest-name={}'.format(tmp_manifest))
repo._Main(repo_cmd)
if os.listdir(self.__workdir) == []:
raise Exception('Sync failed. Is manifest correct?')
with tempfile.TemporaryDirectory() as tmpdir:
tmp_manifest = os.path.join(tmpdir, 'manifest_snapshot')
self.__manifest_out(tmp_manifest)
self.__version = Version.from_file(tmp_manifest)
except Exception as e:
sys.exit(1)
raise (e)
finally:
self.__restore_oldpwd()

def save_manifest(self, filename):
with redirect_stdout(sys.stderr):
full_path = self.__workdir / filename
current_version = self.currentVersion()
current_version = self.__version
print('Saving manifest to {}'.format(full_path))
current_version.to_file(full_path)

def currentVersion(self) -> Version:
with tempfile.TemporaryDirectory() as tmpdir:
tmp_manifest = os.path.join(tmpdir, 'manifest_snapshot')
self.__manifest_out(tmp_manifest)
version = Version.from_file(tmp_manifest)

return version
return self.__version

def metadata(self):
metadata = []
Expand Down Expand Up @@ -242,3 +333,66 @@ def __manifest_out(self, filename):
raise (e)
finally:
self.__restore_oldpwd()

def update_manifest(self, jobs):
    """
    Build the current version from the initialized manifest by resolving
    every project revision through getRevision, without syncing projects.

    The manifest is read from .repo/manifests/<name> inside the work
    directory. Each <remote> fetch url is recorded, each <project> is
    resolved in a pool of jobs processes (DEFAULT_CHECK_JOBS when jobs is
    falsy), and the resolved revision is written back to the project's
    'revision' attribute. The canonicalized manifest is stored as the
    current version.

    Exits the process with status 1 when the manifest file cannot be
    opened or when a project revision could not be fetched. Other
    exceptions propagate to the caller. The previous working directory
    is always restored.
    """
    jobs = jobs or DEFAULT_CHECK_JOBS
    self.__change_to_workdir()
    try:
        with redirect_stdout(sys.stderr):
            print('Updating project revisions in manifest')
            xml = ET.parse('.repo/manifests/'+self.__name)
            manifest = xml.getroot()

            # Get default values from manifest. They stay None when the
            # manifest has no <default> element, so projects that carry
            # their own remote/revision still work.
            defaultRemote = None
            defaultBranch = None
            defaults = manifest.find('default')
            if defaults is not None:
                defaultRemote = defaults.get('remote')
                defaultBranch = defaults.get('revision')

            for r in manifest.findall('remote'):
                url = r.get('fetch').rstrip('/')
                # A fetch url carrying a scheme (http://, ssh://,
                # git://, ...) is absolute; anything else is resolved
                # against the manifest url with its last path component
                # stripped.
                if not url.startswith('http') and '://' not in url:
                    url = re.sub('/[a-z-.]*$', '/', self.__url) + url
                self.__add_remote(r.get('name'), url)

            projectElements = manifest.findall('project')
            projects = []
            for p in projectElements:
                project = p.get('name')
                projectBranch = p.get('revision') or defaultBranch
                projectRemote = p.get('remote') or defaultRemote
                projectRemoteUrl = self.__remote_url(projectRemote)
                projects.append((projectRemote, projectRemoteUrl,
                                 project, projectBranch))

            with Pool(jobs) as pool:
                revisionList = pool.map(multi_run_wrapper, projects)

            # getRevision returns None for a project it could not fetch
            if any(result is None for result in revisionList):
                print('Error fetching some project repo')
                sys.exit(1)

            # Update revisions. pool.map keeps the input order, so the
            # results are paired with their elements by position; keying
            # on remote/project would mix up projects listed several
            # times with different revisions.
            for p, (_, revision) in zip(projectElements, revisionList):
                p.set('revision', revision)

            self.__version = Version(
                ET.canonicalize(
                    ET.tostring(manifest, encoding='unicode'),
                    strip_text=True
                )
            )

    except FileNotFoundError:
        with redirect_stdout(sys.stderr):
            print('cannot open', '.repo/manifests/'+self.__name)
            sys.exit(1)
    except TypeError:
        with redirect_stdout(sys.stderr):
            print('Error fetching some project repo')
            sys.exit(1)
    finally:
        self.__restore_oldpwd()
Loading

0 comments on commit d9902d4

Please sign in to comment.