From 155050eb98d3e71e629c3d527f11b3c2047886aa Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 10 Jan 2024 18:11:01 +0200 Subject: [PATCH] zoekt-archive-index: split out ranking tests and archive indexing (#712) We had ranking e2e tests living in the zoekt-archive-index cmd for convenience since that contained useful functions for indexing a remote tarball from the GitHub API. This commit splits the archive functionality into a new internal/archive package and the ranking tests into a new internal/e2e package. The zoekt-archive-index code is now quite minimal. This is similiar to how zoekt-git-index mostly just calls out to the gitindex package. What is different is that archive package is marked internal, unlike gitindex. gitindex should also be internal, but the code predates go's support for internal. I suspect more of our e2e tests will end up in this package. Test Plan: go test ./... --- cmd/zoekt-archive-index/main.go | 191 +----------------- .../archive}/archive.go | 7 +- .../archive}/e2e_test.go | 4 +- internal/archive/index.go | 187 +++++++++++++++++ internal/e2e/doc.go | 2 + .../e2e}/e2e_rank_test.go | 9 +- internal/e2e/e2e_test.go | 17 ++ .../e2e}/testdata/Get_databaseuser.txt | 0 .../e2e}/testdata/InternalDoer.txt | 0 .../Repository_metadata_Write_rbac.txt | 0 .../e2e}/testdata/bufio_buffer.txt | 0 .../e2e}/testdata/bytes_buffer.txt | 0 .../e2e}/testdata/generate_unit_test.txt | 0 .../e2e}/testdata/graphql_type_User.txt | 0 .../e2e}/testdata/r_cody_sourcegraph_url.txt | 0 .../e2e}/testdata/test_server.txt | 0 16 files changed, 224 insertions(+), 193 deletions(-) rename {cmd/zoekt-archive-index => internal/archive}/archive.go (95%) rename {cmd/zoekt-archive-index => internal/archive}/e2e_test.go (98%) create mode 100644 internal/archive/index.go create mode 100644 internal/e2e/doc.go rename {cmd/zoekt-archive-index => internal/e2e}/e2e_rank_test.go (97%) create mode 100644 internal/e2e/e2e_test.go rename {cmd/zoekt-archive-index => internal/e2e}/testdata/Get_databaseuser.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/InternalDoer.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/Repository_metadata_Write_rbac.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/bufio_buffer.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/bytes_buffer.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/generate_unit_test.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/graphql_type_User.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/r_cody_sourcegraph_url.txt (100%) rename {cmd/zoekt-archive-index => internal/e2e}/testdata/test_server.txt (100%) diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go index 491233e3a..68817cf9e 100644 --- a/cmd/zoekt-archive-index/main.go +++ b/cmd/zoekt-archive-index/main.go @@ -8,191 +8,14 @@ package main import ( - "errors" "flag" - "fmt" - "io" "log" - "net/url" - "strings" - "github.com/sourcegraph/zoekt" - "github.com/sourcegraph/zoekt/build" - "github.com/sourcegraph/zoekt/cmd" "go.uber.org/automaxprocs/maxprocs" -) - -// stripComponents removes the specified number of leading path -// elements. Pathnames with fewer elements will return the empty string. -func stripComponents(path string, count int) string { - for i := 0; path != "" && i < count; i++ { - i := strings.Index(path, "/") - if i < 0 { - return "" - } - path = path[i+1:] - } - return path -} - -// isGitOID checks if the revision is a git OID SHA string. -// -// Note: This doesn't mean the SHA exists in a repository, nor does it mean it -// isn't a ref. Git allows 40-char hexadecimal strings to be references. -func isGitOID(s string) bool { - if len(s) != 40 { - return false - } - for _, r := range s { - if !(('0' <= r && r <= '9') || - ('a' <= r && r <= 'f') || - ('A' <= r && r <= 'F')) { - return false - } - } - return true -} - -type Options struct { - Incremental bool - Archive string - Name string - RepoURL string - Branch string - Commit string - Strip int -} - -func (o *Options) SetDefaults() { - // We guess based on the archive URL. - u, _ := url.Parse(o.Archive) - if u == nil { - return - } - - setRef := func(ref string) { - if isGitOID(ref) && o.Commit == "" { - o.Commit = ref - } - if !isGitOID(ref) && o.Branch == "" { - o.Branch = ref - } - } - - switch u.Host { - case "github.com", "codeload.github.com": - // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 - // https://github.com/octokit/octokit.rb/blob/master/README.md - // https://github.com/octokit/octokit.rb/tree/master/lib - // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master - parts := strings.Split(u.Path, "/") - if len(parts) > 2 && o.Name == "" { - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) - } - if len(parts) > 4 { - setRef(parts[4]) - if u.Host == "github.com" { - o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) - } - } - o.Strip = 1 - case "api.github.com": - // https://api.github.com/repos/octokit/octokit.rb/tarball/master - parts := strings.Split(u.Path, "/") - if len(parts) > 2 && o.Name == "" { - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) - } - if len(parts) > 5 { - setRef(parts[5]) - } - o.Strip = 1 - } -} - -func do(opts Options, bopts build.Options) error { - opts.SetDefaults() - - if opts.Name == "" && opts.RepoURL == "" { - return errors.New("-name or -url required") - } - if opts.Branch == "" { - return errors.New("-branch required") - } - - if opts.Name != "" { - bopts.RepositoryDescription.Name = opts.Name - } - // We do not use this functionality to avoid pulling in the transitive deps of gitindex - /* - if opts.RepoURL != "" { - u, err := url.Parse(opts.RepoURL) - if err != nil { - return err - } - if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { - return err - } - } - */ - bopts.SetDefaults() - bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}} - brs := []string{opts.Branch} - - if opts.Incremental && bopts.IncrementalSkipIndexing() { - return nil - } - - a, err := openArchive(opts.Archive) - if err != nil { - return err - } - defer a.Close() - - bopts.RepositoryDescription.Source = opts.Archive - builder, err := build.NewBuilder(bopts) - if err != nil { - return err - } - - add := func(f *File) error { - defer f.Close() - - contents, err := io.ReadAll(f) - if err != nil { - return err - } - - name := stripComponents(f.Name, opts.Strip) - if name == "" { - return nil - } - - return builder.Add(zoekt.Document{ - Name: name, - Content: contents, - Branches: brs, - }) - } - - for { - f, err := a.Next() - if err == io.EOF { - break - } - if err != nil { - return err - } - - if err := add(f); err != nil { - return err - } - } - - return builder.Finish() -} + "github.com/sourcegraph/zoekt/cmd" + "github.com/sourcegraph/zoekt/internal/archive" +) func main() { var ( @@ -216,12 +39,12 @@ func main() { if len(flag.Args()) != 1 { log.Fatal("expected argument for archive location") } - archive := flag.Args()[0] + archiveURL := flag.Args()[0] bopts := cmd.OptionsFromFlags() - opts := Options{ + opts := archive.Options{ Incremental: *incremental, - Archive: archive, + Archive: archiveURL, Name: *name, RepoURL: *urlRaw, Branch: *branch, @@ -232,7 +55,7 @@ func main() { // Sourcegraph specific: Limit HTTP traffic limitHTTPDefaultClient(*downloadLimitMbps) - if err := do(opts, *bopts); err != nil { + if err := archive.Index(opts, *bopts); err != nil { log.Fatal(err) } } diff --git a/cmd/zoekt-archive-index/archive.go b/internal/archive/archive.go similarity index 95% rename from cmd/zoekt-archive-index/archive.go rename to internal/archive/archive.go index ad5ca4fc1..2c8cb64fa 100644 --- a/cmd/zoekt-archive-index/archive.go +++ b/internal/archive/archive.go @@ -1,4 +1,4 @@ -package main +package archive import ( "archive/tar" @@ -126,7 +126,8 @@ func detectContentType(r io.Reader) (string, io.Reader, error) { return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil } -func openReader(u string) (io.ReadCloser, error) { +// OpenReader returns a reader for the archive at the URL u. +func OpenReader(u string) (io.ReadCloser, error) { if strings.HasPrefix(u, "https://") || strings.HasPrefix(u, "http://") { resp, err := http.Get(u) if err != nil { @@ -155,7 +156,7 @@ func openReader(u string) (io.ReadCloser, error) { // openArchive opens the tar at the URL or filepath u. Also supported is tgz // files over http. func openArchive(u string) (ar Archive, err error) { - readCloser, err := openReader(u) + readCloser, err := OpenReader(u) if err != nil { return nil, err } diff --git a/cmd/zoekt-archive-index/e2e_test.go b/internal/archive/e2e_test.go similarity index 98% rename from cmd/zoekt-archive-index/e2e_test.go rename to internal/archive/e2e_test.go index 8854b8a37..79b3ba538 100644 --- a/cmd/zoekt-archive-index/e2e_test.go +++ b/internal/archive/e2e_test.go @@ -1,4 +1,4 @@ -package main +package archive import ( "archive/tar" @@ -163,7 +163,7 @@ func testIndexIncrementally(t *testing.T, format string) { Strip: 0, } - if err := do(opts, bopts); err != nil { + if err := Index(opts, bopts); err != nil { t.Fatalf("error creating index: %v", err) } diff --git a/internal/archive/index.go b/internal/archive/index.go new file mode 100644 index 000000000..2262c9521 --- /dev/null +++ b/internal/archive/index.go @@ -0,0 +1,187 @@ +// package archive provides indexing of archives from remote URLs. +package archive + +import ( + "errors" + "fmt" + "io" + "net/url" + "strings" + + "github.com/sourcegraph/zoekt" + "github.com/sourcegraph/zoekt/build" +) + +// Options specify the archive specific indexing options. +type Options struct { + Incremental bool + + Archive string + Name string + RepoURL string + Branch string + Commit string + Strip int +} + +func (o *Options) SetDefaults() { + // We guess based on the archive URL. + u, _ := url.Parse(o.Archive) + if u == nil { + return + } + + setRef := func(ref string) { + if isGitOID(ref) && o.Commit == "" { + o.Commit = ref + } + if !isGitOID(ref) && o.Branch == "" { + o.Branch = ref + } + } + + switch u.Host { + case "github.com", "codeload.github.com": + // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 + // https://github.com/octokit/octokit.rb/blob/master/README.md + // https://github.com/octokit/octokit.rb/tree/master/lib + // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master + parts := strings.Split(u.Path, "/") + if len(parts) > 2 && o.Name == "" { + o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) + o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) + } + if len(parts) > 4 { + setRef(parts[4]) + if u.Host == "github.com" { + o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) + } + } + o.Strip = 1 + case "api.github.com": + // https://api.github.com/repos/octokit/octokit.rb/tarball/master + parts := strings.Split(u.Path, "/") + if len(parts) > 2 && o.Name == "" { + o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) + o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) + } + if len(parts) > 5 { + setRef(parts[5]) + } + o.Strip = 1 + } +} + +// Index archive specified in opts using bopts. +func Index(opts Options, bopts build.Options) error { + opts.SetDefaults() + + if opts.Name == "" && opts.RepoURL == "" { + return errors.New("-name or -url required") + } + if opts.Branch == "" { + return errors.New("-branch required") + } + + if opts.Name != "" { + bopts.RepositoryDescription.Name = opts.Name + } + // We do not use this functionality to avoid pulling in the transitive deps of gitindex + /* + if opts.RepoURL != "" { + u, err := url.Parse(opts.RepoURL) + if err != nil { + return err + } + if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { + return err + } + } + */ + bopts.SetDefaults() + bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}} + brs := []string{opts.Branch} + + if opts.Incremental && bopts.IncrementalSkipIndexing() { + return nil + } + + a, err := openArchive(opts.Archive) + if err != nil { + return err + } + defer a.Close() + + bopts.RepositoryDescription.Source = opts.Archive + builder, err := build.NewBuilder(bopts) + if err != nil { + return err + } + + add := func(f *File) error { + defer f.Close() + + contents, err := io.ReadAll(f) + if err != nil { + return err + } + + name := stripComponents(f.Name, opts.Strip) + if name == "" { + return nil + } + + return builder.Add(zoekt.Document{ + Name: name, + Content: contents, + Branches: brs, + }) + } + + for { + f, err := a.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if err := add(f); err != nil { + return err + } + } + + return builder.Finish() +} + +// stripComponents removes the specified number of leading path +// elements. Pathnames with fewer elements will return the empty string. +func stripComponents(path string, count int) string { + for i := 0; path != "" && i < count; i++ { + i := strings.Index(path, "/") + if i < 0 { + return "" + } + path = path[i+1:] + } + return path +} + +// isGitOID checks if the revision is a git OID SHA string. +// +// Note: This doesn't mean the SHA exists in a repository, nor does it mean it +// isn't a ref. Git allows 40-char hexadecimal strings to be references. +func isGitOID(s string) bool { + if len(s) != 40 { + return false + } + for _, r := range s { + if !(('0' <= r && r <= '9') || + ('a' <= r && r <= 'f') || + ('A' <= r && r <= 'F')) { + return false + } + } + return true +} diff --git a/internal/e2e/doc.go b/internal/e2e/doc.go new file mode 100644 index 000000000..291d8bd11 --- /dev/null +++ b/internal/e2e/doc.go @@ -0,0 +1,2 @@ +// package e2e contains end to end tests +package e2e diff --git a/cmd/zoekt-archive-index/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go similarity index 97% rename from cmd/zoekt-archive-index/e2e_rank_test.go rename to internal/e2e/e2e_rank_test.go index bcb826838..4b56909b8 100644 --- a/cmd/zoekt-archive-index/e2e_rank_test.go +++ b/internal/e2e/e2e_rank_test.go @@ -1,4 +1,4 @@ -package main +package e2e import ( "bytes" @@ -17,6 +17,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/build" + "github.com/sourcegraph/zoekt/internal/archive" "github.com/sourcegraph/zoekt/query" "github.com/sourcegraph/zoekt/shards" ) @@ -135,7 +136,7 @@ func indexURL(indexDir, u string) error { return err } - opts := Options{ + opts := archive.Options{ Archive: u, } opts.SetDefaults() // sets metadata like Name and the codeload URL @@ -158,7 +159,7 @@ func indexURL(indexDir, u string) error { // languageMap[lang] = ctags.ScipCTags // } - err := do(opts, build.Options{ + err := archive.Index(opts, build.Options{ IndexDir: indexDir, CTagsMustSucceed: true, }) @@ -172,7 +173,7 @@ func indexURL(indexDir, u string) error { func download(url, dst string) error { tmpPath := dst + ".part" - rc, err := openReader(url) + rc, err := archive.OpenReader(url) if err != nil { return err } diff --git a/internal/e2e/e2e_test.go b/internal/e2e/e2e_test.go new file mode 100644 index 000000000..911f0d4db --- /dev/null +++ b/internal/e2e/e2e_test.go @@ -0,0 +1,17 @@ +package e2e + +import ( + "flag" + "io" + "log" + "os" + "testing" +) + +func TestMain(m *testing.M) { + flag.Parse() + if !testing.Verbose() { + log.SetOutput(io.Discard) + } + os.Exit(m.Run()) +} diff --git a/cmd/zoekt-archive-index/testdata/Get_databaseuser.txt b/internal/e2e/testdata/Get_databaseuser.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/Get_databaseuser.txt rename to internal/e2e/testdata/Get_databaseuser.txt diff --git a/cmd/zoekt-archive-index/testdata/InternalDoer.txt b/internal/e2e/testdata/InternalDoer.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/InternalDoer.txt rename to internal/e2e/testdata/InternalDoer.txt diff --git a/cmd/zoekt-archive-index/testdata/Repository_metadata_Write_rbac.txt b/internal/e2e/testdata/Repository_metadata_Write_rbac.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/Repository_metadata_Write_rbac.txt rename to internal/e2e/testdata/Repository_metadata_Write_rbac.txt diff --git a/cmd/zoekt-archive-index/testdata/bufio_buffer.txt b/internal/e2e/testdata/bufio_buffer.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/bufio_buffer.txt rename to internal/e2e/testdata/bufio_buffer.txt diff --git a/cmd/zoekt-archive-index/testdata/bytes_buffer.txt b/internal/e2e/testdata/bytes_buffer.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/bytes_buffer.txt rename to internal/e2e/testdata/bytes_buffer.txt diff --git a/cmd/zoekt-archive-index/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/generate_unit_test.txt rename to internal/e2e/testdata/generate_unit_test.txt diff --git a/cmd/zoekt-archive-index/testdata/graphql_type_User.txt b/internal/e2e/testdata/graphql_type_User.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/graphql_type_User.txt rename to internal/e2e/testdata/graphql_type_User.txt diff --git a/cmd/zoekt-archive-index/testdata/r_cody_sourcegraph_url.txt b/internal/e2e/testdata/r_cody_sourcegraph_url.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/r_cody_sourcegraph_url.txt rename to internal/e2e/testdata/r_cody_sourcegraph_url.txt diff --git a/cmd/zoekt-archive-index/testdata/test_server.txt b/internal/e2e/testdata/test_server.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/test_server.txt rename to internal/e2e/testdata/test_server.txt