Skip to content

Commit

Permalink
Avoid fetching files over size limit (#814)
Browse files Browse the repository at this point in the history
We never index files over 1MB, unless the "LargeFiles" allowlist is set. So in
most cases, we can avoid fetching them at all.

This PR updates the `git fetch` to filter out files over 1MB when possible, and
exclude tags. It also refactors the very long `gitIndex` method.
  • Loading branch information
jtibshirani authored Sep 5, 2024
1 parent e727656 commit c284149
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 38 deletions.
75 changes: 52 additions & 23 deletions cmd/zoekt-sourcegraph-indexserver/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,13 +185,11 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
if c.runCmd == nil {
return errors.New("runCmd in provided configuration was nil - a function must be provided")
}
runCmd := c.runCmd

if c.findRepositoryMetadata == nil {
return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided")
}

buildOptions := o.BuildOptions()
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
defer cancel()

Expand All @@ -201,6 +199,25 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
}
defer os.RemoveAll(gitDir) // best-effort cleanup

err = fetchRepo(ctx, gitDir, o, c, logger)
if err != nil {
return err
}

err = setZoektConfig(ctx, gitDir, o, c)
if err != nil {
return err
}

err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger)
if err != nil {
return err
}

return nil
}

func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
// Create a repo to fetch into
cmd := exec.CommandContext(ctx, "git",
// use a random default branch. This is so that HEAD isn't a symref to a
Expand All @@ -212,7 +229,7 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
"--bare",
gitDir)
cmd.Stdin = &bytes.Buffer{}
if err := runCmd(cmd); err != nil {
if err := c.runCmd(cmd); err != nil {
return err
}

Expand All @@ -234,9 +251,16 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
"-C", gitDir,
"-c", "protocol.version=2",
"-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal",
"fetch", "--depth=1", o.CloneURL,
"fetch", "--depth=1", "--no-tags",
}

// If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files.
if len(o.LargeFiles) == 0 {
fetchArgs = append(fetchArgs, "--filter=blob:limit=1m")
}

fetchArgs = append(fetchArgs, o.CloneURL)

var commits []string
for _, b := range branches {
commits = append(commits, b.Version)
Expand All @@ -248,7 +272,7 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
cmd.Stdin = &bytes.Buffer{}

start := time.Now()
err := runCmd(cmd)
err := c.runCmd(cmd)
fetchDuration += time.Since(start)

if err != nil {
Expand Down Expand Up @@ -286,8 +310,8 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
if o.UseDelta {
err := fetchPriorAndLatestCommits()
if err != nil {
name := buildOptions.RepositoryDescription.Name
id := buildOptions.RepositoryDescription.ID
name := o.BuildOptions().RepositoryDescription.Name
id := o.BuildOptions().RepositoryDescription.ID

log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err)
err = fetchOnlyLatestCommits()
Expand All @@ -302,34 +326,37 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
}
}

logger.Debug("successfully fetched git data",
sglog.String("repo", o.Name),
sglog.Uint32("id", o.RepoID),
sglog.Int("commits_count", successfullyFetchedCommitsCount),
sglog.Duration("duration", fetchDuration),
)

// We then create the relevant refs for each fetched commit.
for _, b := range o.Branches {
ref := b.Name
if ref != "HEAD" {
ref = "refs/heads/" + ref
}
cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version)
cmd.Stdin = &bytes.Buffer{}
if err := runCmd(cmd); err != nil {
if err := c.runCmd(cmd); err != nil {
return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err)
}
}

logger.Debug("successfully fetched git data",
sglog.String("repo", o.Name),
sglog.Uint32("id", o.RepoID),
sglog.Int("commits_count", successfullyFetchedCommitsCount),
sglog.Duration("duration", fetchDuration),
)
return nil
}

func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error {
// create git configuration with options
type configKV struct{ Key, Value string }
config := []configKV{{
// zoekt.name is used by zoekt-git-index to set the repository name.
Key: "name",
Value: o.Name,
}}
for k, v := range buildOptions.RepositoryDescription.RawConfig {
for k, v := range o.BuildOptions().RepositoryDescription.RawConfig {
config = append(config, configKV{Key: k, Value: v})
}
sort.Slice(config, func(i, j int) bool {
Expand All @@ -338,13 +365,16 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L

// write git configuration to repo
for _, kv := range config {
cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value)
cmd.Stdin = &bytes.Buffer{}
if err := runCmd(cmd); err != nil {
if err := c.runCmd(cmd); err != nil {
return err
}
}
return nil
}

func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error {
args := []string{
"-submodules=false",
}
Expand Down Expand Up @@ -413,15 +443,14 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L
args = append(args, "-language_map", strings.Join(languageMap, ","))
}

args = append(args, buildOptions.Args()...)
args = append(args, o.BuildOptions().Args()...)
args = append(args, gitDir)

cmd = exec.CommandContext(ctx, "zoekt-git-index", args...)
cmd := exec.CommandContext(ctx, "zoekt-git-index", args...)
cmd.Stdin = &bytes.Buffer{}
if err := runCmd(cmd); err != nil {
if err := c.runCmd(cmd); err != nil {
return err
}

return nil
}

Expand Down
8 changes: 4 additions & 4 deletions cmd/zoekt-sourcegraph-indexserver/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ func TestIndex(t *testing.T) {
},
want: []string{
"git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags --filter=blob:limit=1m http://api.test/.internal/git/test/repo deadbeef",
"git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef",
"git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0",
"git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0",
Expand All @@ -511,7 +511,7 @@ func TestIndex(t *testing.T) {
},
want: []string{
"git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags --filter=blob:limit=1m http://api.test/.internal/git/test/repo deadbeef",
"git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef",
"git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0",
"git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0",
Expand Down Expand Up @@ -541,7 +541,7 @@ func TestIndex(t *testing.T) {
},
want: []string{
"git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef feebdaed",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags http://api.test/.internal/git/test/repo deadbeef feebdaed",
"git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef",
"git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed",
"git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0",
Expand Down Expand Up @@ -587,7 +587,7 @@ func TestIndex(t *testing.T) {
},
want: []string{
"git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef feebdaed 12345678 oldhead olddev oldrelease",
"git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags http://api.test/.internal/git/test/repo deadbeef feebdaed 12345678 oldhead olddev oldrelease",
"git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef",
"git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed",
"git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/release 12345678",
Expand Down
11 changes: 6 additions & 5 deletions cmd/zoekt-sourcegraph-indexserver/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ var (
clientMetrics *grpcprom.ClientMetrics
)

// 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22
// NOTE: if you change this, you must also update gitIndex to use the same value when fetching the repo.
const MaxFileSize = 1 << 20

// set of repositories that we want to capture separate indexing metrics for
var reposWithSeparateIndexingMetrics = make(map[string]struct{})

Expand Down Expand Up @@ -656,10 +660,7 @@ func (s *Server) indexArgs(opts IndexOptions) *indexArgs {
IndexDir: s.IndexDir,
Parallelism: parallelism,
Incremental: true,

// 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22
FileLimit: 1 << 20,

FileLimit: MaxFileSize,
ShardMerging: s.shardMerging,
}
}
Expand Down Expand Up @@ -754,7 +755,7 @@ var rootTmpl = template.Must(template.New("name").Parse(`
<a href="?show_repos=false">hide repos</a><br />
<table style="margin-top: 20px">
<th style="text-align:left">Name</th>
<th style="text-align:left">ID</th>
<th style="text-align:left">ID (click to reindex)</th>
{{range .Repos}}
<tr>
<td>{{.Name}}</td>
Expand Down
22 changes: 16 additions & 6 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -887,18 +887,19 @@ func createDocument(key fileKey,
opts build.Options,
) (zoekt.Document, error) {
blob, err := repos[key].Repo.BlobObject(key.ID)

// We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
if errors.Is(err, plumbing.ErrObjectNotFound) {
return skippedLargeDoc(key, branchMap, opts), nil
}

if err != nil {
return zoekt.Document{}, err
}

keyFullPath := key.FullPath()
if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
return zoekt.Document{
SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.SizeMax),
Name: key.FullPath(),
Branches: branchMap[key],
SubRepositoryPath: key.SubRepoPath,
}, nil
return skippedLargeDoc(key, branchMap, opts), nil
}

contents, err := blobContents(blob)
Expand All @@ -922,6 +923,15 @@ func createDocument(key fileKey,
}, nil
}

func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document {
return zoekt.Document{
SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax),
Name: key.FullPath(),
Branches: branchMap[key],
SubRepositoryPath: key.SubRepoPath,
}
}

func blobContents(blob *object.Blob) ([]byte, error) {
r, err := blob.Reader()
if err != nil {
Expand Down

0 comments on commit c284149

Please sign in to comment.