From c284149613946128affc8c3265bc96eb565fb1b1 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 5 Sep 2024 09:53:06 -0700 Subject: [PATCH] Avoid fetching files over size limit (#814) We never index files over 1MB, unless the "LargeFiles" allowlist is set. So in most cases, we can avoid fetching them at all. This PR updates the `git fetch` to filter out files over 1MB when possible, and exclude tags. It also refactors the very long `gitIndex` method. --- cmd/zoekt-sourcegraph-indexserver/index.go | 75 +++++++++++++------ .../index_test.go | 8 +- cmd/zoekt-sourcegraph-indexserver/main.go | 11 +-- gitindex/index.go | 22 ++++-- 4 files changed, 78 insertions(+), 38 deletions(-) diff --git a/cmd/zoekt-sourcegraph-indexserver/index.go b/cmd/zoekt-sourcegraph-indexserver/index.go index a07cf29d..be395276 100644 --- a/cmd/zoekt-sourcegraph-indexserver/index.go +++ b/cmd/zoekt-sourcegraph-indexserver/index.go @@ -185,13 +185,11 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L if c.runCmd == nil { return errors.New("runCmd in provided configuration was nil - a function must be provided") } - runCmd := c.runCmd if c.findRepositoryMetadata == nil { return errors.New("findRepositoryMetadata in provided configuration was nil - a function must be provided") } - buildOptions := o.BuildOptions() ctx, cancel := context.WithTimeout(context.Background(), c.timeout) defer cancel() @@ -201,6 +199,25 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L } defer os.RemoveAll(gitDir) // best-effort cleanup + err = fetchRepo(ctx, gitDir, o, c, logger) + if err != nil { + return err + } + + err = setZoektConfig(ctx, gitDir, o, c) + if err != nil { + return err + } + + err = indexRepo(ctx, gitDir, sourcegraph, o, c, logger) + if err != nil { + return err + } + + return nil +} + +func fetchRepo(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { // Create a repo to fetch into cmd := exec.CommandContext(ctx, "git", // use a random default branch. This is so that HEAD isn't a symref to a @@ -212,7 +229,7 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L "--bare", gitDir) cmd.Stdin = &bytes.Buffer{} - if err := runCmd(cmd); err != nil { + if err := c.runCmd(cmd); err != nil { return err } @@ -234,9 +251,16 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L "-C", gitDir, "-c", "protocol.version=2", "-c", "http.extraHeader=X-Sourcegraph-Actor-UID: internal", - "fetch", "--depth=1", o.CloneURL, + "fetch", "--depth=1", "--no-tags", } + // If there are no exceptions to MaxFileSize (1MB), we can avoid fetching these large files. + if len(o.LargeFiles) == 0 { + fetchArgs = append(fetchArgs, "--filter=blob:limit=1m") + } + + fetchArgs = append(fetchArgs, o.CloneURL) + var commits []string for _, b := range branches { commits = append(commits, b.Version) @@ -248,7 +272,7 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L cmd.Stdin = &bytes.Buffer{} start := time.Now() - err := runCmd(cmd) + err := c.runCmd(cmd) fetchDuration += time.Since(start) if err != nil { @@ -286,8 +310,8 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L if o.UseDelta { err := fetchPriorAndLatestCommits() if err != nil { - name := buildOptions.RepositoryDescription.Name - id := buildOptions.RepositoryDescription.ID + name := o.BuildOptions().RepositoryDescription.Name + id := o.BuildOptions().RepositoryDescription.ID log.Printf("delta build: failed to prepare delta build for %q (ID %d): failed to fetch both latest and prior commits: %s", name, id, err) err = fetchOnlyLatestCommits() @@ -302,26 +326,29 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L } } - logger.Debug("successfully fetched git data", - sglog.String("repo", o.Name), - sglog.Uint32("id", o.RepoID), - sglog.Int("commits_count", successfullyFetchedCommitsCount), - sglog.Duration("duration", fetchDuration), - ) - // We then create the relevant refs for each fetched commit. for _, b := range o.Branches { ref := b.Name if ref != "HEAD" { ref = "refs/heads/" + ref } - cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) + cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "update-ref", ref, b.Version) cmd.Stdin = &bytes.Buffer{} - if err := runCmd(cmd); err != nil { + if err := c.runCmd(cmd); err != nil { return fmt.Errorf("failed update-ref %s to %s: %w", ref, b.Version, err) } } + logger.Debug("successfully fetched git data", + sglog.String("repo", o.Name), + sglog.Uint32("id", o.RepoID), + sglog.Int("commits_count", successfullyFetchedCommitsCount), + sglog.Duration("duration", fetchDuration), + ) + return nil +} + +func setZoektConfig(ctx context.Context, gitDir string, o *indexArgs, c gitIndexConfig) error { // create git configuration with options type configKV struct{ Key, Value string } config := []configKV{{ @@ -329,7 +356,7 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L Key: "name", Value: o.Name, }} - for k, v := range buildOptions.RepositoryDescription.RawConfig { + for k, v := range o.BuildOptions().RepositoryDescription.RawConfig { config = append(config, configKV{Key: k, Value: v}) } sort.Slice(config, func(i, j int) bool { @@ -338,13 +365,16 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L // write git configuration to repo for _, kv := range config { - cmd = exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) + cmd := exec.CommandContext(ctx, "git", "-C", gitDir, "config", "zoekt."+kv.Key, kv.Value) cmd.Stdin = &bytes.Buffer{} - if err := runCmd(cmd); err != nil { + if err := c.runCmd(cmd); err != nil { return err } } + return nil +} +func indexRepo(ctx context.Context, gitDir string, sourcegraph Sourcegraph, o *indexArgs, c gitIndexConfig, logger sglog.Logger) error { args := []string{ "-submodules=false", } @@ -413,15 +443,14 @@ func gitIndex(c gitIndexConfig, o *indexArgs, sourcegraph Sourcegraph, l sglog.L args = append(args, "-language_map", strings.Join(languageMap, ",")) } - args = append(args, buildOptions.Args()...) + args = append(args, o.BuildOptions().Args()...) args = append(args, gitDir) - cmd = exec.CommandContext(ctx, "zoekt-git-index", args...) + cmd := exec.CommandContext(ctx, "zoekt-git-index", args...) cmd.Stdin = &bytes.Buffer{} - if err := runCmd(cmd); err != nil { + if err := c.runCmd(cmd); err != nil { return err } - return nil } diff --git a/cmd/zoekt-sourcegraph-indexserver/index_test.go b/cmd/zoekt-sourcegraph-indexserver/index_test.go index 674e0879..7ff7fcd0 100644 --- a/cmd/zoekt-sourcegraph-indexserver/index_test.go +++ b/cmd/zoekt-sourcegraph-indexserver/index_test.go @@ -489,7 +489,7 @@ func TestIndex(t *testing.T) { }, want: []string{ "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef", + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags --filter=blob:limit=1m http://api.test/.internal/git/test/repo deadbeef", "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", @@ -511,7 +511,7 @@ func TestIndex(t *testing.T) { }, want: []string{ "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef", + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags --filter=blob:limit=1m http://api.test/.internal/git/test/repo deadbeef", "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", "git -C $TMPDIR/test%2Frepo.git config zoekt.fork 0", @@ -541,7 +541,7 @@ func TestIndex(t *testing.T) { }, want: []string{ "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef feebdaed", + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags http://api.test/.internal/git/test/repo deadbeef feebdaed", "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed", "git -C $TMPDIR/test%2Frepo.git config zoekt.archived 0", @@ -587,7 +587,7 @@ func TestIndex(t *testing.T) { }, want: []string{ "git -c init.defaultBranch=nonExistentBranchBB0FOFCH32 init --bare $TMPDIR/test%2Frepo.git", - "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 http://api.test/.internal/git/test/repo deadbeef feebdaed 12345678 oldhead olddev oldrelease", + "git -C $TMPDIR/test%2Frepo.git -c protocol.version=2 -c http.extraHeader=X-Sourcegraph-Actor-UID: internal fetch --depth=1 --no-tags http://api.test/.internal/git/test/repo deadbeef feebdaed 12345678 oldhead olddev oldrelease", "git -C $TMPDIR/test%2Frepo.git update-ref HEAD deadbeef", "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/dev feebdaed", "git -C $TMPDIR/test%2Frepo.git update-ref refs/heads/release 12345678", diff --git a/cmd/zoekt-sourcegraph-indexserver/main.go b/cmd/zoekt-sourcegraph-indexserver/main.go index f10705ed..65c5ecf0 100644 --- a/cmd/zoekt-sourcegraph-indexserver/main.go +++ b/cmd/zoekt-sourcegraph-indexserver/main.go @@ -136,6 +136,10 @@ var ( clientMetrics *grpcprom.ClientMetrics ) +// 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22 +// NOTE: if you change this, you must also update gitIndex to use the same value when fetching the repo. +const MaxFileSize = 1 << 20 + // set of repositories that we want to capture separate indexing metrics for var reposWithSeparateIndexingMetrics = make(map[string]struct{}) @@ -656,10 +660,7 @@ func (s *Server) indexArgs(opts IndexOptions) *indexArgs { IndexDir: s.IndexDir, Parallelism: parallelism, Incremental: true, - - // 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22 - FileLimit: 1 << 20, - + FileLimit: MaxFileSize, ShardMerging: s.shardMerging, } } @@ -754,7 +755,7 @@ var rootTmpl = template.Must(template.New("name").Parse(` hide repos
- + {{range .Repos}} diff --git a/gitindex/index.go b/gitindex/index.go index 5a295d05..92cf3686 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -887,18 +887,19 @@ func createDocument(key fileKey, opts build.Options, ) (zoekt.Document, error) { blob, err := repos[key].Repo.BlobObject(key.ID) + + // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. + if errors.Is(err, plumbing.ErrObjectNotFound) { + return skippedLargeDoc(key, branchMap, opts), nil + } + if err != nil { return zoekt.Document{}, err } keyFullPath := key.FullPath() if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { - return zoekt.Document{ - SkipReason: fmt.Sprintf("file size %d exceeds maximum size %d", blob.Size, opts.SizeMax), - Name: key.FullPath(), - Branches: branchMap[key], - SubRepositoryPath: key.SubRepoPath, - }, nil + return skippedLargeDoc(key, branchMap, opts), nil } contents, err := blobContents(blob) @@ -922,6 +923,15 @@ func createDocument(key fileKey, }, nil } +func skippedLargeDoc(key fileKey, branchMap map[fileKey][]string, opts build.Options) zoekt.Document { + return zoekt.Document{ + SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), + Name: key.FullPath(), + Branches: branchMap[key], + SubRepositoryPath: key.SubRepoPath, + } +} + func blobContents(blob *object.Blob) ([]byte, error) { r, err := blob.Reader() if err != nil {
NameIDID (click to reindex)
{{.Name}}