diff --git a/cmd/zoekt-repo-index/main.go b/cmd/zoekt-repo-index/main.go index 1ca2d7fa7..34baabd5f 100644 --- a/cmd/zoekt-repo-index/main.go +++ b/cmd/zoekt-repo-index/main.go @@ -43,6 +43,7 @@ import ( "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/build" "github.com/sourcegraph/zoekt/gitindex" + "github.com/sourcegraph/zoekt/ignore" "go.uber.org/automaxprocs/maxprocs" git "github.com/go-git/go-git/v5" @@ -180,7 +181,7 @@ func main() { } } - perBranch := map[string]map[fileKey]gitindex.BlobRepo{} + perBranch := map[string]map[fileKey]gitindex.BlobLocation{} opts.SubRepositories = map[string]*zoekt.Repository{} // branch => repo => version @@ -325,8 +326,8 @@ func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, func iterateManifest(mf *manifest.Manifest, baseURL url.URL, revPrefix string, cache *gitindex.RepoCache, -) (map[fileKey]gitindex.BlobRepo, map[string]plumbing.Hash, error) { - allFiles := map[fileKey]gitindex.BlobRepo{} +) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) { + allFiles := map[fileKey]gitindex.BlobLocation{} allVersions := map[string]plumbing.Hash{} for _, p := range mf.Project { rev := mf.ProjectRevision(&p) @@ -359,12 +360,13 @@ func iterateManifest(mf *manifest.Manifest, return nil, nil, err } - files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache) + rw := gitindex.NewRepoWalker(topRepo, projURL.String(), cache) + versions, err := rw.CollectFiles(tree, rev, &ignore.Matcher{}) if err != nil { return nil, nil, err } - for key, repo := range files { + for key, repo := range rw.Files { allFiles[fileKey{ SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath), Path: key.Path, diff --git a/gitindex/index.go b/gitindex/index.go index 3eae74d7a..fea30423e 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -426,7 +426,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { } // branch => (path, sha1) => repo. - var repos map[fileKey]BlobIndexInfo + var repos map[fileKey]BlobLocation // Branch => Repo => SHA1 var branchVersions map[string]map[string]plumbing.Hash @@ -452,7 +452,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { } } - reposByPath := map[string]BlobIndexInfo{} + reposByPath := map[string]BlobLocation{} for key, info := range repos { reposByPath[key.SubRepoPath] = info } @@ -461,9 +461,9 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { for path, info := range reposByPath { tpl := opts.BuildOptions.RepositoryDescription if path != "" { - tpl = zoekt.Repository{URL: info.Repo.URL.String()} - if err := SetTemplatesFromOrigin(&tpl, info.Repo.URL); err != nil { - log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.Repo.URL, err) + tpl = zoekt.Repository{URL: info.URL.String()} + if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil { + log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err) } } opts.BuildOptions.SubRepositories[path] = &tpl @@ -592,11 +592,11 @@ func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) { // prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing // a build.Builder instance for generating a delta build. -type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) +type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) // prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing // a build.Builder instance for generating a normal build. -type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) +type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) type gitIndexConfig struct { // prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to @@ -612,7 +612,7 @@ type gitIndexConfig struct { prepareNormalBuild prepareNormalBuildFunc } -func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { +func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { if options.Submodules { return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing") } @@ -670,7 +670,7 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f } // branch => (path, sha1) => repo. - repos = map[fileKey]BlobIndexInfo{} + repos = map[fileKey]BlobLocation{} // branch name -> git worktree at most current commit branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches)) @@ -683,7 +683,7 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f tree, err := commit.Tree() if err != nil { - return nil, nil, nil, fmt.Errorf("getting current git tree for branch %q: %w", b, err) + return nil, nil, nil, fmt.Errorf("getting current git Files for branch %q: %w", b, err) } branchToCurrentTree[b] = tree @@ -696,12 +696,7 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f } // TODO: Support repository submodules for delta builds - // For this prototype, we are ignoring repository submodules, which means that we can use the same - // blob location for all files - hackSharedBlobLocation := BlobRepo{ - GitRepo: repository, - URL: u, - } + // loop over all branches, calculate the diff between our // last indexed commit and the current commit, and add files mentioned in the diff for _, branch := range existingRepository.Branches { @@ -712,7 +707,7 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f lastIndexedTree, err := lastIndexedCommit.Tree() if err != nil { - return nil, nil, nil, fmt.Errorf("getting lasted indexed git tree for branch %q: %w", branch.Name, err) + return nil, nil, nil, fmt.Errorf("getting lasted indexed git Files for branch %q: %w", branch.Name, err) } changes, err := object.DiffTreeWithOptions(context.Background(), lastIndexedTree, branchToCurrentTree[branch.Name], &object.DiffTreeOptions{DetectRenames: false}) @@ -742,8 +737,9 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f existing.Branches = append(existing.Branches, branch.Name) repos[file] = existing } else { - repos[file] = BlobIndexInfo{ - Repo: hackSharedBlobLocation, + repos[file] = BlobLocation{ + GitRepo: repository, + URL: u, Branches: []string{branch.Name}, } } @@ -780,8 +776,9 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f existing.Branches = append(existing.Branches, b) repos[file] = existing } else { - repos[file] = BlobIndexInfo{ - Repo: hackSharedBlobLocation, + repos[file] = BlobLocation{ + GitRepo: repository, + URL: u, Branches: []string{b}, } } @@ -806,15 +803,12 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f return repos, nil, changedOrDeletedPaths, nil } -func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) { +func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { var repoCache *RepoCache if options.Submodules { repoCache = NewRepoCache(options.RepoCacheDir) } - // branch => (path, sha1) => metadata. - repos = map[fileKey]BlobIndexInfo{} - // Branch => Repo => SHA1 branchVersions = map[string]map[string]plumbing.Hash{} @@ -823,6 +817,7 @@ func prepareNormalBuild(options Options, repository *git.Repository) (repos map[ return nil, nil, fmt.Errorf("expandBranches: %w", err) } + rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache) for _, b := range branches { commit, err := getCommit(repository, options.BranchPrefix, b) if err != nil { @@ -843,35 +838,23 @@ func prepareNormalBuild(options Options, repository *git.Repository) (repos map[ return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err) } - files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache) + subVersions, err := rw.CollectFiles(tree, b, ig) if err != nil { - return nil, nil, fmt.Errorf("TreeToFiles: %w", err) - } - for k, v := range files { - if ig.Match(k.Path) { - continue - } - - if existing, ok := repos[k]; ok { - existing.Branches = append(existing.Branches, b) - repos[k] = existing - } else { - repos[k] = BlobIndexInfo{Repo: v, Branches: []string{b}} - } + return nil, nil, fmt.Errorf("CollectFiles: %w", err) } branchVersions[b] = subVersions } - return repos, branchVersions, nil + return rw.Files, branchVersions, nil } func createDocument(key fileKey, - repos map[fileKey]BlobIndexInfo, + repos map[fileKey]BlobLocation, ranks repoPathRanks, opts build.Options, ) (zoekt.Document, error) { - repo := repos[key].Repo + repo := repos[key] blob, err := repo.GitRepo.BlobObject(key.ID) branches := repos[key].Branches diff --git a/gitindex/index_test.go b/gitindex/index_test.go index fff8d8ab9..c18d7ed34 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -608,13 +608,13 @@ func TestIndexDeltaBasic(t *testing.T) { // setup: prepare spy versions of prepare delta / normal build so that we can observe // whether they were called appropriately deltaBuildCalled := false - prepareDeltaSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { + prepareDeltaSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) { deltaBuildCalled = true return prepareDeltaBuild(options, repository) } normalBuildCalled := false - prepareNormalSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) { + prepareNormalSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) { normalBuildCalled = true return prepareNormalBuild(options, repository) } diff --git a/gitindex/tree.go b/gitindex/tree.go index 31b480cb2..7b64eb801 100644 --- a/gitindex/tree.go +++ b/gitindex/tree.go @@ -26,32 +26,32 @@ import ( "github.com/go-git/go-git/v5/plumbing" "github.com/go-git/go-git/v5/plumbing/filemode" "github.com/go-git/go-git/v5/plumbing/object" + "github.com/sourcegraph/zoekt/ignore" - git "github.com/go-git/go-git/v5" + "github.com/go-git/go-git/v5" ) -// repoWalker walks a tree, recursing into submodules. -type repoWalker struct { - repo *git.Repository +// RepoWalker walks one or more commit trees, collecting the files to index in its Files map. +// +// It also recurses into submodules if Options.Submodules is enabled. +type RepoWalker struct { + Files map[fileKey]BlobLocation + repo *git.Repository repoURL *url.URL - tree map[fileKey]BlobRepo // Path => SubmoduleEntry submodules map[string]*SubmoduleEntry - - // Path => commit SHA1 - subRepoVersions map[string]plumbing.Hash - repoCache *RepoCache + repoCache *RepoCache } // subURL returns the URL for a submodule. -func (w *repoWalker) subURL(relURL string) (*url.URL, error) { - if w.repoURL == nil { +func (rw *RepoWalker) subURL(relURL string) (*url.URL, error) { + if rw.repoURL == nil { return nil, fmt.Errorf("no URL for base repo") } if strings.HasPrefix(relURL, "../") { - u := *w.repoURL + u := *rw.repoURL u.Path = path.Join(u.Path, relURL) return &u, nil } @@ -59,20 +59,19 @@ func (w *repoWalker) subURL(relURL string) (*url.URL, error) { return url.Parse(relURL) } -// newRepoWalker creates a new repoWalker. -func newRepoWalker(r *git.Repository, repoURL string, repoCache *RepoCache) *repoWalker { +// NewRepoWalker creates a new RepoWalker. +func NewRepoWalker(r *git.Repository, repoURL string, repoCache *RepoCache) *RepoWalker { u, _ := url.Parse(repoURL) - return &repoWalker{ - repo: r, - repoURL: u, - tree: map[fileKey]BlobRepo{}, - repoCache: repoCache, - subRepoVersions: map[string]plumbing.Hash{}, + return &RepoWalker{ + repo: r, + repoURL: u, + Files: map[fileKey]BlobLocation{}, + repoCache: repoCache, } } // parseModuleMap initializes rw.submodules. -func (rw *repoWalker) parseModuleMap(t *object.Tree) error { +func (rw *RepoWalker) parseModuleMap(t *object.Tree) error { if rw.repoCache == nil { return nil } @@ -94,49 +93,57 @@ func (rw *repoWalker) parseModuleMap(t *object.Tree) error { return nil } -// TreeToFiles fetches the blob SHA1s for a tree. If repoCache is +// CollectFiles fetches the blob SHA1s for the tree. If repoCache is // non-nil, recurse into submodules. In addition, it returns a mapping // that indicates in which repo each SHA1 can be found. -func TreeToFiles(r *git.Repository, t *object.Tree, repoURL string, repoCache *RepoCache) (map[fileKey]BlobRepo, map[string]plumbing.Hash, error) { - rw := newRepoWalker(r, repoURL, repoCache) - +// +// The collected files are available through the RepoWalker.Files map. +func (rw *RepoWalker) CollectFiles(t *object.Tree, branch string, ig *ignore.Matcher) (map[string]plumbing.Hash, error) { if err := rw.parseModuleMap(t); err != nil { - return nil, nil, fmt.Errorf("parseModuleMap: %w", err) + return nil, fmt.Errorf("parseModuleMap: %w", err) + } + + ig, err := newIgnoreMatcher(t) + if err != nil { + return nil, fmt.Errorf("newIgnoreMatcher: %w", err) } tw := object.NewTreeWalker(t, true, make(map[plumbing.Hash]bool)) defer tw.Close() + + // Path => commit SHA1 + subRepoVersions := make(map[string]plumbing.Hash) for { name, entry, err := tw.Next() if err == io.EOF { break } - if err := rw.handleEntry(name, &entry); err != nil { - return nil, nil, fmt.Errorf("handleEntry: %w", err) + if err := rw.handleEntry(name, &entry, branch, subRepoVersions, ig); err != nil { + return nil, fmt.Errorf("handleEntry: %w", err) } } - return rw.tree, rw.subRepoVersions, nil + return subRepoVersions, nil } -func (r *repoWalker) tryHandleSubmodule(p string, id *plumbing.Hash) error { - if err := r.handleSubmodule(p, id); err != nil { +func (rw *RepoWalker) tryHandleSubmodule(p string, id *plumbing.Hash, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error { + if err := rw.handleSubmodule(p, id, branch, subRepoVersions, ig); err != nil { log.Printf("submodule %s: ignoring error %v", p, err) } return nil } -func (r *repoWalker) handleSubmodule(p string, id *plumbing.Hash) error { - submod := r.submodules[p] +func (rw *RepoWalker) handleSubmodule(p string, id *plumbing.Hash, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error { + submod := rw.submodules[p] if submod == nil { - return fmt.Errorf("no entry for submodule path %q", r.repoURL) + return fmt.Errorf("no entry for submodule path %q", rw.repoURL) } - subURL, err := r.subURL(submod.URL) + subURL, err := rw.subURL(submod.URL) if err != nil { return err } - subRepo, err := r.repoCache.Open(subURL) + subRepo, err := rw.repoCache.Open(subURL) if err != nil { return err } @@ -150,28 +157,29 @@ func (r *repoWalker) handleSubmodule(p string, id *plumbing.Hash) error { return err } - r.subRepoVersions[p] = *id + subRepoVersions[p] = *id - subTree, subVersions, err := TreeToFiles(subRepo, tree, subURL.String(), r.repoCache) + sw := NewRepoWalker(subRepo, subURL.String(), rw.repoCache) + subVersions, err := sw.CollectFiles(tree, branch, ig) if err != nil { return err } - for k, repo := range subTree { - r.tree[fileKey{ + for k, repo := range sw.Files { + rw.Files[fileKey{ SubRepoPath: filepath.Join(p, k.SubRepoPath), Path: k.Path, ID: k.ID, }] = repo } for k, v := range subVersions { - r.subRepoVersions[filepath.Join(p, k)] = v + subRepoVersions[filepath.Join(p, k)] = v } return nil } -func (r *repoWalker) handleEntry(p string, e *object.TreeEntry) error { - if e.Mode == filemode.Submodule && r.repoCache != nil { - if err := r.tryHandleSubmodule(p, &e.Hash); err != nil { +func (rw *RepoWalker) handleEntry(p string, e *object.TreeEntry, branch string, subRepoVersions map[string]plumbing.Hash, ig *ignore.Matcher) error { + if e.Mode == filemode.Submodule && rw.repoCache != nil { + if err := rw.tryHandleSubmodule(p, &e.Hash, branch, subRepoVersions, ig); err != nil { return fmt.Errorf("submodule %s: %v", p, err) } } @@ -182,10 +190,19 @@ func (r *repoWalker) handleEntry(p string, e *object.TreeEntry) error { return nil } - r.tree[fileKey{Path: p, ID: e.Hash}] = BlobRepo{ - GitRepo: r.repo, - URL: r.repoURL, + // Skip ignored files + if ig.Match(p) { + return nil } + + key := fileKey{Path: p, ID: e.Hash} + if existing, ok := rw.Files[key]; ok { + existing.Branches = append(existing.Branches, branch) + rw.Files[key] = existing + } else { + rw.Files[key] = BlobLocation{GitRepo: rw.repo, URL: rw.repoURL, Branches: []string{branch}} + } + return nil } @@ -201,20 +218,17 @@ func (k *fileKey) FullPath() string { return filepath.Join(k.SubRepoPath, k.Path) } -// BlobIndexInfo contains information about the blob that's needed for indexing. -type BlobIndexInfo struct { - Repo BlobRepo - // Branches is the list of branches that contain the blob. - Branches []string -} - -// BlobRepo holds the repo where the blob can be found. -type BlobRepo struct { +// BlobLocation holds the repo where the blob can be found, plus other information +// needed for indexing like its branches. +type BlobLocation struct { GitRepo *git.Repository URL *url.URL + + // Branches is the list of branches that contain the blob. + Branches []string } -func (l *BlobRepo) Blob(id *plumbing.Hash) ([]byte, error) { +func (l *BlobLocation) Blob(id *plumbing.Hash) ([]byte, error) { blob, err := l.GitRepo.BlobObject(*id) if err != nil { return nil, err diff --git a/gitindex/tree_test.go b/gitindex/tree_test.go index 406c4b175..ba4518a29 100644 --- a/gitindex/tree_test.go +++ b/gitindex/tree_test.go @@ -29,6 +29,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt/ignore" "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/build" @@ -139,7 +140,7 @@ func TestFindGitRepos(t *testing.T) { } } -func TestTreeToFiles(t *testing.T) { +func TestCollectFiles(t *testing.T) { dir := t.TempDir() if err := createSubmoduleRepo(dir); err != nil { @@ -168,9 +169,10 @@ func TestTreeToFiles(t *testing.T) { t.Fatalf("AsTree: %v", err) } - files, versions, err := TreeToFiles(repo, tree, aURL.String(), cache) + rw := NewRepoWalker(repo, aURL.String(), cache) + versions, err := rw.CollectFiles(tree, "main", &ignore.Matcher{}) if err != nil { - t.Fatalf("TreeToFiles: %v", err) + t.Fatalf("CollectFiles: %v", err) } bnameHash := versions["bname"] @@ -181,7 +183,7 @@ func TestTreeToFiles(t *testing.T) { } var paths []string - for k := range files { + for k := range rw.Files { paths = append(paths, k.FullPath()) } sort.Strings(paths)