Skip to content

Commit

Permalink
zoekt-archive-index: split out ranking tests and archive indexing (#712)
Browse files Browse the repository at this point in the history
We had ranking e2e tests living in the zoekt-archive-index cmd for
convenience since that contained useful functions for indexing a remote
tarball from the GitHub API. This commit splits the archive
functionality into a new internal/archive package and the ranking tests
into a new internal/e2e package.

The zoekt-archive-index code is now quite minimal. This is similar to
how zoekt-git-index mostly just calls out to the gitindex package. What
is different is that the archive package is marked internal, unlike
gitindex. gitindex should also be internal, but the code predates Go's
support for internal.

I suspect more of our e2e tests will end up in this package.

Test Plan: go test ./...
  • Loading branch information
keegancsmith authored Jan 10, 2024
1 parent 7487a0d commit 155050e
Show file tree
Hide file tree
Showing 16 changed files with 224 additions and 193 deletions.
191 changes: 7 additions & 184 deletions cmd/zoekt-archive-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,191 +8,14 @@
package main

import (
"errors"
"flag"
"fmt"
"io"
"log"
"net/url"
"strings"

"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/build"
"github.com/sourcegraph/zoekt/cmd"
"go.uber.org/automaxprocs/maxprocs"
)

// stripComponents drops the first count slash-separated elements from path
// and returns what is left. If path runs out of elements before count of
// them have been removed, the result is the empty string.
func stripComponents(path string, count int) string {
	for removed := 0; removed < count && path != ""; removed++ {
		slash := strings.IndexByte(path, '/')
		if slash < 0 {
			// The remaining text is a single element; stripping it leaves nothing.
			return ""
		}
		path = path[slash+1:]
	}
	return path
}

// isGitOID reports whether s has the shape of a git object ID: exactly 40
// hexadecimal characters, in either case.
//
// Note: A true result does not mean the SHA exists in a repository, nor that
// s isn't a ref. Git allows 40-char hexadecimal strings to be references.
func isGitOID(s string) bool {
	if len(s) != 40 {
		return false
	}
	for _, c := range s {
		switch {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			// Anything outside [0-9a-fA-F] disqualifies the string.
			return false
		}
	}
	return true
}

// Options describes a single archive to index and the repository metadata
// to record for it. Fields left empty may be filled in by SetDefaults, which
// guesses them from the archive URL for GitHub hosts.
type Options struct {
// Incremental, when set, lets indexing return early without reading the
// archive if the build options report that indexing can be skipped.
Incremental bool

// Archive is the location of the archive to index. It is handed to
// openArchive and recorded as the repository's Source.
Archive string
// Name is the repository name stored in the repository description.
Name string
// RepoURL is the URL of the repository. Either Name or RepoURL must be
// set for indexing to proceed.
RepoURL string
// Branch is the name of the branch being indexed. It is required.
Branch string
// Commit is recorded as the version of Branch.
Commit string
// Strip is the number of leading path elements removed from each file
// name in the archive before it is added to the index.
Strip int
}

// SetDefaults fills in fields of o that can be guessed from the archive URL.
//
// Only GitHub hosts are recognised (github.com, codeload.github.com and
// api.github.com). For those:
//   - Name and RepoURL are derived from the owner and repository in the URL
//     path, but only when Name is still empty.
//   - The ref in the URL is stored in Commit if it looks like a git OID and
//     Commit is empty, otherwise in Branch if Branch is empty.
//   - A github.com URL is rewritten to the equivalent codeload.github.com
//     tarball URL.
//   - Strip is set to 1.
//
// Any other archive location, including one that fails to parse, leaves o
// untouched.
func (o *Options) SetDefaults() {
	// We guess based on the archive URL. A parse failure only means there is
	// nothing to guess from, so the error is deliberately ignored.
	u, _ := url.Parse(o.Archive)
	if u == nil {
		return
	}

	setRef := func(ref string) {
		if isGitOID(ref) && o.Commit == "" {
			o.Commit = ref
		}
		if !isGitOID(ref) && o.Branch == "" {
			o.Branch = ref
		}
	}

	// setRepo records the repository identity unless the caller already
	// supplied a name.
	setRepo := func(owner, repo string) {
		if o.Name != "" {
			return
		}
		o.Name = fmt.Sprintf("github.com/%s/%s", owner, repo)
		o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", owner, repo)
	}

	// u.Path starts with "/", so parts[0] is always the empty string.
	parts := strings.Split(u.Path, "/")

	switch u.Host {
	case "github.com", "codeload.github.com":
		// https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1
		// https://github.com/octokit/octokit.rb/blob/master/README.md
		// https://github.com/octokit/octokit.rb/tree/master/lib
		// https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master
		if len(parts) > 2 {
			setRepo(parts[1], parts[2])
		}
		if len(parts) > 4 {
			setRef(parts[4])
			if u.Host == "github.com" {
				o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4])
			}
		}
		o.Strip = 1
	case "api.github.com":
		// https://api.github.com/repos/octokit/octokit.rb/tarball/master
		//
		// The path carries a leading "repos" element, so the owner and
		// repository sit one position further right than on github.com:
		// parts[1] is "repos", parts[2] the owner and parts[3] the repository.
		if len(parts) > 3 {
			setRepo(parts[2], parts[3])
		}
		if len(parts) > 5 {
			setRef(parts[5])
		}
		o.Strip = 1
	}
}

// do indexes the archive described by opts using the build options bopts.
//
// It first lets opts guess missing values from the archive URL, then requires
// a repository name or URL and a branch. When opts.Incremental is set and
// bopts reports that indexing can be skipped, it returns nil without opening
// the archive. Otherwise every file in the archive is read in full, has
// opts.Strip leading path elements removed from its name, and is added to the
// index under the single branch opts.Branch. Files whose stripped name is
// empty are skipped.
func do(opts Options, bopts build.Options) error {
	opts.SetDefaults()

	switch {
	case opts.Name == "" && opts.RepoURL == "":
		return errors.New("-name or -url required")
	case opts.Branch == "":
		return errors.New("-branch required")
	}

	if opts.Name != "" {
		bopts.RepositoryDescription.Name = opts.Name
	}
	// We do not use this functionality to avoid pulling in the transitive deps of gitindex
	/*
		if opts.RepoURL != "" {
			u, err := url.Parse(opts.RepoURL)
			if err != nil {
				return err
			}
			if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil {
				return err
			}
		}
	*/
	bopts.SetDefaults()
	bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}}
	branches := []string{opts.Branch}

	if opts.Incremental && bopts.IncrementalSkipIndexing() {
		return nil
	}

	ar, err := openArchive(opts.Archive)
	if err != nil {
		return err
	}
	defer ar.Close()

	bopts.RepositoryDescription.Source = opts.Archive
	builder, err := build.NewBuilder(bopts)
	if err != nil {
		return err
	}

	// indexFile reads one archive entry and hands it to the builder. It lives
	// in a closure so the deferred Close runs per file rather than at the end
	// of do.
	indexFile := func(f *File) error {
		defer f.Close()

		data, err := io.ReadAll(f)
		if err != nil {
			return err
		}

		if name := stripComponents(f.Name, opts.Strip); name != "" {
			return builder.Add(zoekt.Document{
				Name:     name,
				Content:  data,
				Branches: branches,
			})
		}
		return nil
	}

	for {
		f, err := ar.Next()
		if err != nil {
			if err == io.EOF {
				// End of archive: flush what has been added.
				return builder.Finish()
			}
			return err
		}

		if err := indexFile(f); err != nil {
			return err
		}
	}
}
"github.com/sourcegraph/zoekt/cmd"
"github.com/sourcegraph/zoekt/internal/archive"
)

func main() {
var (
Expand All @@ -216,12 +39,12 @@ func main() {
if len(flag.Args()) != 1 {
log.Fatal("expected argument for archive location")
}
archive := flag.Args()[0]
archiveURL := flag.Args()[0]
bopts := cmd.OptionsFromFlags()
opts := Options{
opts := archive.Options{
Incremental: *incremental,

Archive: archive,
Archive: archiveURL,
Name: *name,
RepoURL: *urlRaw,
Branch: *branch,
Expand All @@ -232,7 +55,7 @@ func main() {
// Sourcegraph specific: Limit HTTP traffic
limitHTTPDefaultClient(*downloadLimitMbps)

if err := do(opts, *bopts); err != nil {
if err := archive.Index(opts, *bopts); err != nil {
log.Fatal(err)
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package archive

import (
"archive/tar"
Expand Down Expand Up @@ -126,7 +126,8 @@ func detectContentType(r io.Reader) (string, io.Reader, error) {
return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil
}

func openReader(u string) (io.ReadCloser, error) {
// OpenReader returns a reader for the archive at the URL u.
func OpenReader(u string) (io.ReadCloser, error) {
if strings.HasPrefix(u, "https://") || strings.HasPrefix(u, "http://") {
resp, err := http.Get(u)
if err != nil {
Expand Down Expand Up @@ -155,7 +156,7 @@ func openReader(u string) (io.ReadCloser, error) {
// openArchive opens the tar at the URL or filepath u. Also supported is tgz
// files over http.
func openArchive(u string) (ar Archive, err error) {
readCloser, err := openReader(u)
readCloser, err := OpenReader(u)
if err != nil {
return nil, err
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package archive

import (
"archive/tar"
Expand Down Expand Up @@ -163,7 +163,7 @@ func testIndexIncrementally(t *testing.T, format string) {
Strip: 0,
}

if err := do(opts, bopts); err != nil {
if err := Index(opts, bopts); err != nil {
t.Fatalf("error creating index: %v", err)
}

Expand Down
Loading

0 comments on commit 155050e

Please sign in to comment.