diff --git a/cmd/zoekt-archive-index/main.go b/cmd/zoekt-archive-index/main.go index 491233e3a..68817cf9e 100644 --- a/cmd/zoekt-archive-index/main.go +++ b/cmd/zoekt-archive-index/main.go @@ -8,191 +8,14 @@ package main import ( - "errors" "flag" - "fmt" - "io" "log" - "net/url" - "strings" - "github.com/sourcegraph/zoekt" - "github.com/sourcegraph/zoekt/build" - "github.com/sourcegraph/zoekt/cmd" "go.uber.org/automaxprocs/maxprocs" -) - -// stripComponents removes the specified number of leading path -// elements. Pathnames with fewer elements will return the empty string. -func stripComponents(path string, count int) string { - for i := 0; path != "" && i < count; i++ { - i := strings.Index(path, "/") - if i < 0 { - return "" - } - path = path[i+1:] - } - return path -} - -// isGitOID checks if the revision is a git OID SHA string. -// -// Note: This doesn't mean the SHA exists in a repository, nor does it mean it -// isn't a ref. Git allows 40-char hexadecimal strings to be references. -func isGitOID(s string) bool { - if len(s) != 40 { - return false - } - for _, r := range s { - if !(('0' <= r && r <= '9') || - ('a' <= r && r <= 'f') || - ('A' <= r && r <= 'F')) { - return false - } - } - return true -} - -type Options struct { - Incremental bool - Archive string - Name string - RepoURL string - Branch string - Commit string - Strip int -} - -func (o *Options) SetDefaults() { - // We guess based on the archive URL. - u, _ := url.Parse(o.Archive) - if u == nil { - return - } - - setRef := func(ref string) { - if isGitOID(ref) && o.Commit == "" { - o.Commit = ref - } - if !isGitOID(ref) && o.Branch == "" { - o.Branch = ref - } - } - - switch u.Host { - case "github.com", "codeload.github.com": - // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 - // https://github.com/octokit/octokit.rb/blob/master/README.md - // https://github.com/octokit/octokit.rb/tree/master/lib - // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master - parts := strings.Split(u.Path, "/") - if len(parts) > 2 && o.Name == "" { - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) - } - if len(parts) > 4 { - setRef(parts[4]) - if u.Host == "github.com" { - o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) - } - } - o.Strip = 1 - case "api.github.com": - // https://api.github.com/repos/octokit/octokit.rb/tarball/master - parts := strings.Split(u.Path, "/") - if len(parts) > 2 && o.Name == "" { - o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) - o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) - } - if len(parts) > 5 { - setRef(parts[5]) - } - o.Strip = 1 - } -} - -func do(opts Options, bopts build.Options) error { - opts.SetDefaults() - - if opts.Name == "" && opts.RepoURL == "" { - return errors.New("-name or -url required") - } - if opts.Branch == "" { - return errors.New("-branch required") - } - - if opts.Name != "" { - bopts.RepositoryDescription.Name = opts.Name - } - // We do not use this functionality to avoid pulling in the transitive deps of gitindex - /* - if opts.RepoURL != "" { - u, err := url.Parse(opts.RepoURL) - if err != nil { - return err - } - if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { - return err - } - } - */ - bopts.SetDefaults() - bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}} - brs := []string{opts.Branch} - - if opts.Incremental && bopts.IncrementalSkipIndexing() { - return nil - } - - a, err := openArchive(opts.Archive) - if err != nil { - return err - } - defer a.Close() - - bopts.RepositoryDescription.Source = opts.Archive - builder, err := build.NewBuilder(bopts) - if err != nil { - return err - } - - add := func(f *File) error { - defer f.Close() - - contents, err := io.ReadAll(f) - if err != nil { - return err - } - - name := stripComponents(f.Name, opts.Strip) - if name == "" { - return nil - } - - return builder.Add(zoekt.Document{ - Name: name, - Content: contents, - Branches: brs, - }) - } - - for { - f, err := a.Next() - if err == io.EOF { - break - } - if err != nil { - return err - } - - if err := add(f); err != nil { - return err - } - } - - return builder.Finish() -} + "github.com/sourcegraph/zoekt/cmd" + "github.com/sourcegraph/zoekt/internal/archive" +) func main() { var ( @@ -216,12 +39,12 @@ func main() { if len(flag.Args()) != 1 { log.Fatal("expected argument for archive location") } - archive := flag.Args()[0] + archiveURL := flag.Args()[0] bopts := cmd.OptionsFromFlags() - opts := Options{ + opts := archive.Options{ Incremental: *incremental, - Archive: archive, + Archive: archiveURL, Name: *name, RepoURL: *urlRaw, Branch: *branch, @@ -232,7 +55,7 @@ func main() { // Sourcegraph specific: Limit HTTP traffic limitHTTPDefaultClient(*downloadLimitMbps) - if err := do(opts, *bopts); err != nil { + if err := archive.Index(opts, *bopts); err != nil { log.Fatal(err) } } diff --git a/cmd/zoekt-archive-index/archive.go b/internal/archive/archive.go similarity index 95% rename from cmd/zoekt-archive-index/archive.go rename to internal/archive/archive.go index ad5ca4fc1..2c8cb64fa 100644 --- a/cmd/zoekt-archive-index/archive.go +++ b/internal/archive/archive.go @@ -1,4 +1,4 @@ -package main +package archive import ( "archive/tar" @@ -126,7 +126,8 @@ func detectContentType(r io.Reader) (string, io.Reader, error) { return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil } -func openReader(u string) (io.ReadCloser, error) { +// OpenReader returns a reader for the archive at the URL u. +func OpenReader(u string) (io.ReadCloser, error) { if strings.HasPrefix(u, "https://") || strings.HasPrefix(u, "http://") { resp, err := http.Get(u) if err != nil { @@ -155,7 +156,7 @@ func openReader(u string) (io.ReadCloser, error) { // openArchive opens the tar at the URL or filepath u. Also supported is tgz // files over http. func openArchive(u string) (ar Archive, err error) { - readCloser, err := openReader(u) + readCloser, err := OpenReader(u) if err != nil { return nil, err } diff --git a/cmd/zoekt-archive-index/e2e_test.go b/internal/archive/e2e_test.go similarity index 98% rename from cmd/zoekt-archive-index/e2e_test.go rename to internal/archive/e2e_test.go index 8854b8a37..79b3ba538 100644 --- a/cmd/zoekt-archive-index/e2e_test.go +++ b/internal/archive/e2e_test.go @@ -1,4 +1,4 @@ -package main +package archive import ( "archive/tar" @@ -163,7 +163,7 @@ func testIndexIncrementally(t *testing.T, format string) { Strip: 0, } - if err := do(opts, bopts); err != nil { + if err := Index(opts, bopts); err != nil { t.Fatalf("error creating index: %v", err) } diff --git a/internal/archive/index.go b/internal/archive/index.go new file mode 100644 index 000000000..2262c9521 --- /dev/null +++ b/internal/archive/index.go @@ -0,0 +1,187 @@ +// package archive provides indexing of archives from remote URLs. +package archive + +import ( + "errors" + "fmt" + "io" + "net/url" + "strings" + + "github.com/sourcegraph/zoekt" + "github.com/sourcegraph/zoekt/build" +) + +// Options specify the archive specific indexing options. +type Options struct { + Incremental bool + + Archive string + Name string + RepoURL string + Branch string + Commit string + Strip int +} + +func (o *Options) SetDefaults() { + // We guess based on the archive URL. + u, _ := url.Parse(o.Archive) + if u == nil { + return + } + + setRef := func(ref string) { + if isGitOID(ref) && o.Commit == "" { + o.Commit = ref + } + if !isGitOID(ref) && o.Branch == "" { + o.Branch = ref + } + } + + switch u.Host { + case "github.com", "codeload.github.com": + // https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1 + // https://github.com/octokit/octokit.rb/blob/master/README.md + // https://github.com/octokit/octokit.rb/tree/master/lib + // https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master + parts := strings.Split(u.Path, "/") + if len(parts) > 2 && o.Name == "" { + o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) + o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) + } + if len(parts) > 4 { + setRef(parts[4]) + if u.Host == "github.com" { + o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4]) + } + } + o.Strip = 1 + case "api.github.com": + // https://api.github.com/repos/octokit/octokit.rb/tarball/master + parts := strings.Split(u.Path, "/") + if len(parts) > 2 && o.Name == "" { + o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2]) + o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2]) + } + if len(parts) > 5 { + setRef(parts[5]) + } + o.Strip = 1 + } +} + +// Index archive specified in opts using bopts. +func Index(opts Options, bopts build.Options) error { + opts.SetDefaults() + + if opts.Name == "" && opts.RepoURL == "" { + return errors.New("-name or -url required") + } + if opts.Branch == "" { + return errors.New("-branch required") + } + + if opts.Name != "" { + bopts.RepositoryDescription.Name = opts.Name + } + // We do not use this functionality to avoid pulling in the transitive deps of gitindex + /* + if opts.RepoURL != "" { + u, err := url.Parse(opts.RepoURL) + if err != nil { + return err + } + if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil { + return err + } + } + */ + bopts.SetDefaults() + bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}} + brs := []string{opts.Branch} + + if opts.Incremental && bopts.IncrementalSkipIndexing() { + return nil + } + + a, err := openArchive(opts.Archive) + if err != nil { + return err + } + defer a.Close() + + bopts.RepositoryDescription.Source = opts.Archive + builder, err := build.NewBuilder(bopts) + if err != nil { + return err + } + + add := func(f *File) error { + defer f.Close() + + contents, err := io.ReadAll(f) + if err != nil { + return err + } + + name := stripComponents(f.Name, opts.Strip) + if name == "" { + return nil + } + + return builder.Add(zoekt.Document{ + Name: name, + Content: contents, + Branches: brs, + }) + } + + for { + f, err := a.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if err := add(f); err != nil { + return err + } + } + + return builder.Finish() +} + +// stripComponents removes the specified number of leading path +// elements. Pathnames with fewer elements will return the empty string. +func stripComponents(path string, count int) string { + for i := 0; path != "" && i < count; i++ { + i := strings.Index(path, "/") + if i < 0 { + return "" + } + path = path[i+1:] + } + return path +} + +// isGitOID checks if the revision is a git OID SHA string. +// +// Note: This doesn't mean the SHA exists in a repository, nor does it mean it +// isn't a ref. Git allows 40-char hexadecimal strings to be references. +func isGitOID(s string) bool { + if len(s) != 40 { + return false + } + for _, r := range s { + if !(('0' <= r && r <= '9') || + ('a' <= r && r <= 'f') || + ('A' <= r && r <= 'F')) { + return false + } + } + return true +} diff --git a/internal/e2e/doc.go b/internal/e2e/doc.go new file mode 100644 index 000000000..291d8bd11 --- /dev/null +++ b/internal/e2e/doc.go @@ -0,0 +1,2 @@ +// package e2e contains end to end tests +package e2e diff --git a/cmd/zoekt-archive-index/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go similarity index 97% rename from cmd/zoekt-archive-index/e2e_rank_test.go rename to internal/e2e/e2e_rank_test.go index bcb826838..4b56909b8 100644 --- a/cmd/zoekt-archive-index/e2e_rank_test.go +++ b/internal/e2e/e2e_rank_test.go @@ -1,4 +1,4 @@ -package main +package e2e import ( "bytes" @@ -17,6 +17,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/sourcegraph/zoekt" "github.com/sourcegraph/zoekt/build" + "github.com/sourcegraph/zoekt/internal/archive" "github.com/sourcegraph/zoekt/query" "github.com/sourcegraph/zoekt/shards" ) @@ -135,7 +136,7 @@ func indexURL(indexDir, u string) error { return err } - opts := Options{ + opts := archive.Options{ Archive: u, } opts.SetDefaults() // sets metadata like Name and the codeload URL @@ -158,7 +159,7 @@ func indexURL(indexDir, u string) error { // languageMap[lang] = ctags.ScipCTags // } - err := do(opts, build.Options{ + err := archive.Index(opts, build.Options{ IndexDir: indexDir, CTagsMustSucceed: true, }) @@ -172,7 +173,7 @@ func indexURL(indexDir, u string) error { func download(url, dst string) error { tmpPath := dst + ".part" - rc, err := openReader(url) + rc, err := archive.OpenReader(url) if err != nil { return err } diff --git a/internal/e2e/e2e_test.go b/internal/e2e/e2e_test.go new file mode 100644 index 000000000..911f0d4db --- /dev/null +++ b/internal/e2e/e2e_test.go @@ -0,0 +1,17 @@ +package e2e + +import ( + "flag" + "io" + "log" + "os" + "testing" +) + +func TestMain(m *testing.M) { + flag.Parse() + if !testing.Verbose() { + log.SetOutput(io.Discard) + } + os.Exit(m.Run()) +} diff --git a/cmd/zoekt-archive-index/testdata/Get_databaseuser.txt b/internal/e2e/testdata/Get_databaseuser.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/Get_databaseuser.txt rename to internal/e2e/testdata/Get_databaseuser.txt diff --git a/cmd/zoekt-archive-index/testdata/InternalDoer.txt b/internal/e2e/testdata/InternalDoer.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/InternalDoer.txt rename to internal/e2e/testdata/InternalDoer.txt diff --git a/cmd/zoekt-archive-index/testdata/Repository_metadata_Write_rbac.txt b/internal/e2e/testdata/Repository_metadata_Write_rbac.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/Repository_metadata_Write_rbac.txt rename to internal/e2e/testdata/Repository_metadata_Write_rbac.txt diff --git a/cmd/zoekt-archive-index/testdata/bufio_buffer.txt b/internal/e2e/testdata/bufio_buffer.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/bufio_buffer.txt rename to internal/e2e/testdata/bufio_buffer.txt diff --git a/cmd/zoekt-archive-index/testdata/bytes_buffer.txt b/internal/e2e/testdata/bytes_buffer.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/bytes_buffer.txt rename to internal/e2e/testdata/bytes_buffer.txt diff --git a/cmd/zoekt-archive-index/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/generate_unit_test.txt rename to internal/e2e/testdata/generate_unit_test.txt diff --git a/cmd/zoekt-archive-index/testdata/graphql_type_User.txt b/internal/e2e/testdata/graphql_type_User.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/graphql_type_User.txt rename to internal/e2e/testdata/graphql_type_User.txt diff --git a/cmd/zoekt-archive-index/testdata/r_cody_sourcegraph_url.txt b/internal/e2e/testdata/r_cody_sourcegraph_url.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/r_cody_sourcegraph_url.txt rename to internal/e2e/testdata/r_cody_sourcegraph_url.txt diff --git a/cmd/zoekt-archive-index/testdata/test_server.txt b/internal/e2e/testdata/test_server.txt similarity index 100% rename from cmd/zoekt-archive-index/testdata/test_server.txt rename to internal/e2e/testdata/test_server.txt