Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zoekt-archive-index: split out ranking tests and archive indexing #712

Merged
merged 1 commit into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 7 additions & 184 deletions cmd/zoekt-archive-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,191 +8,14 @@
package main

import (
"errors"
"flag"
"fmt"
"io"
"log"
"net/url"
"strings"

"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/build"
"github.com/sourcegraph/zoekt/cmd"
"go.uber.org/automaxprocs/maxprocs"
)

// stripComponents drops the first count slash-separated elements from path
// and returns what is left. If path runs out of elements before count of
// them have been removed, the result is the empty string. A count of zero
// or less returns path unchanged.
func stripComponents(path string, count int) string {
	for remaining := count; remaining > 0 && path != ""; remaining-- {
		slash := strings.IndexByte(path, '/')
		if slash == -1 {
			// No separator left: path is a single element, so
			// stripping it leaves nothing.
			return ""
		}
		path = path[slash+1:]
	}
	return path
}

// isGitOID reports whether s has the shape of a git object ID: exactly 40
// hexadecimal characters, in either case.
//
// Note: a match is purely syntactic. It does not mean the SHA exists in a
// repository, nor that s is not a ref; git allows 40-char hexadecimal
// strings to be references.
func isGitOID(s string) bool {
	const oidHexLen = 40
	if len(s) != oidHexLen {
		return false
	}
	for _, c := range s {
		switch {
		case c >= '0' && c <= '9':
		case c >= 'a' && c <= 'f':
		case c >= 'A' && c <= 'F':
		default:
			return false
		}
	}
	return true
}

// Options describes the archive to index and the repository metadata that
// is recorded for it. Fields left empty may be filled in by SetDefaults.
type Options struct {
// Incremental, when true, allows indexing to be skipped entirely if the
// build options report (via IncrementalSkipIndexing) that it is unnecessary.
Incremental bool

// Archive is the URL or file path of the archive to open. It is also
// recorded as the repository description's Source.
Archive string
// Name is the repository name written to the repository description.
// Either Name or RepoURL must be non-empty for indexing to proceed.
Name string
// RepoURL is the URL of the repository. In the visible code it is only
// consulted to validate that Name or RepoURL was provided.
RepoURL string
// Branch is the name of the single branch the indexed documents are
// attributed to. It is required.
Branch string
// Commit is recorded as the Version of Branch.
Commit string
// Strip is the number of leading path elements removed from each file
// name in the archive before it is added to the index.
Strip int
}

// SetDefaults fills in options that were left empty by guessing them from
// the Archive URL. Only GitHub hosts are recognised; for any other host, or
// when Archive cannot be parsed as a URL, o is left untouched.
//
// For a recognised host:
//   - Name and RepoURL are both derived from the owner and repository in the
//     URL path, but only when Name is empty.
//   - The ref found in the path becomes Commit if it looks like a git OID and
//     Branch otherwise; an already-set value is never overwritten.
//   - Strip is set to 1.
//   - github.com URLs additionally have Archive rewritten to the matching
//     codeload.github.com tarball URL.
func (o *Options) SetDefaults() {
	// We guess based on the archive URL. A parse failure yields a nil URL,
	// in which case there is nothing to guess from, so the error itself is
	// deliberately not inspected.
	u, _ := url.Parse(o.Archive)
	if u == nil {
		return
	}

	// setRef stores ref as Commit or Branch depending on its shape.
	setRef := func(ref string) {
		if isGitOID(ref) && o.Commit == "" {
			o.Commit = ref
		}
		if !isGitOID(ref) && o.Branch == "" {
			o.Branch = ref
		}
	}

	// setRepo derives Name and RepoURL from owner/repo unless the caller
	// already supplied a Name.
	setRepo := func(owner, repo string) {
		if o.Name != "" {
			return
		}
		o.Name = fmt.Sprintf("github.com/%s/%s", owner, repo)
		o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", owner, repo)
	}

	// u.Path starts with "/", so parts[0] is always the empty string.
	parts := strings.Split(u.Path, "/")

	switch u.Host {
	case "github.com", "codeload.github.com":
		// https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1
		// https://github.com/octokit/octokit.rb/blob/master/README.md
		// https://github.com/octokit/octokit.rb/tree/master/lib
		// https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master
		//
		// parts: "", owner, repo, kind, ref, ...
		if len(parts) > 2 {
			setRepo(parts[1], parts[2])
		}
		if len(parts) > 4 {
			setRef(parts[4])
			if u.Host == "github.com" {
				o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4])
			}
		}
		o.Strip = 1
	case "api.github.com":
		// https://api.github.com/repos/octokit/octokit.rb/tarball/master
		//
		// parts: "", "repos", owner, repo, "tarball", ref
		//
		// The owner and repository sit one element later than on
		// github.com because of the leading "repos" segment.
		if len(parts) > 3 {
			setRepo(parts[2], parts[3])
		}
		if len(parts) > 5 {
			setRef(parts[5])
		}
		o.Strip = 1
	}
}

// do indexes the archive described by opts using the build options bopts.
//
// Defaults are applied to opts first. A repository name or URL and a branch
// are then required; otherwise an error is returned before any work is done.
// When opts.Incremental is set and bopts reports that indexing can be
// skipped, do returns nil without opening the archive. Otherwise every file
// in the archive is read in full, its name is stripped of opts.Strip leading
// path elements, and it is added to the index under opts.Branch. Files whose
// stripped name is empty are read but not indexed.
//
// Both arguments are passed by value, so the caller's copies are not
// modified.
func do(opts Options, bopts build.Options) error {
	opts.SetDefaults()

	switch {
	case opts.Name == "" && opts.RepoURL == "":
		return errors.New("-name or -url required")
	case opts.Branch == "":
		return errors.New("-branch required")
	}

	if opts.Name != "" {
		bopts.RepositoryDescription.Name = opts.Name
	}
	// We do not use this functionality to avoid pulling in the transitive deps of gitindex
	/*
		if opts.RepoURL != "" {
			u, err := url.Parse(opts.RepoURL)
			if err != nil {
				return err
			}
			if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil {
				return err
			}
		}
	*/
	bopts.SetDefaults()
	bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{
		{Name: opts.Branch, Version: opts.Commit},
	}
	branches := []string{opts.Branch}

	if opts.Incremental && bopts.IncrementalSkipIndexing() {
		return nil
	}

	ar, err := openArchive(opts.Archive)
	if err != nil {
		return err
	}
	defer ar.Close()

	bopts.RepositoryDescription.Source = opts.Archive
	builder, err := build.NewBuilder(bopts)
	if err != nil {
		return err
	}

	// indexFile is a closure so that each file is closed as soon as it has
	// been handled rather than when do returns.
	indexFile := func(entry *File) error {
		defer entry.Close()

		// The content is read before the name check, so skipped files
		// are still consumed (and read errors on them still surface).
		data, err := io.ReadAll(entry)
		if err != nil {
			return err
		}

		stripped := stripComponents(entry.Name, opts.Strip)
		if stripped == "" {
			return nil
		}

		doc := zoekt.Document{
			Name:     stripped,
			Content:  data,
			Branches: branches,
		}
		return builder.Add(doc)
	}

	for {
		entry, err := ar.Next()
		switch {
		case err == io.EOF:
			// End of archive: flush the index.
			return builder.Finish()
		case err != nil:
			return err
		}

		if err := indexFile(entry); err != nil {
			return err
		}
	}
}
"github.com/sourcegraph/zoekt/cmd"
"github.com/sourcegraph/zoekt/internal/archive"
)

func main() {
var (
Expand All @@ -216,12 +39,12 @@ func main() {
if len(flag.Args()) != 1 {
log.Fatal("expected argument for archive location")
}
archive := flag.Args()[0]
archiveURL := flag.Args()[0]
bopts := cmd.OptionsFromFlags()
opts := Options{
opts := archive.Options{
Incremental: *incremental,

Archive: archive,
Archive: archiveURL,
Name: *name,
RepoURL: *urlRaw,
Branch: *branch,
Expand All @@ -232,7 +55,7 @@ func main() {
// Sourcegraph specific: Limit HTTP traffic
limitHTTPDefaultClient(*downloadLimitMbps)

if err := do(opts, *bopts); err != nil {
if err := archive.Index(opts, *bopts); err != nil {
log.Fatal(err)
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package archive

import (
"archive/tar"
Expand Down Expand Up @@ -126,7 +126,8 @@ func detectContentType(r io.Reader) (string, io.Reader, error) {
return ct, io.MultiReader(bytes.NewReader(buf[:n]), r), nil
}

func openReader(u string) (io.ReadCloser, error) {
// OpenReader returns a reader for the archive at the URL u.
func OpenReader(u string) (io.ReadCloser, error) {
if strings.HasPrefix(u, "https://") || strings.HasPrefix(u, "http://") {
resp, err := http.Get(u)
if err != nil {
Expand Down Expand Up @@ -155,7 +156,7 @@ func openReader(u string) (io.ReadCloser, error) {
// openArchive opens the tar at the URL or filepath u. tgz files served over
// HTTP are also supported.
func openArchive(u string) (ar Archive, err error) {
readCloser, err := openReader(u)
readCloser, err := OpenReader(u)
if err != nil {
return nil, err
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package archive

import (
"archive/tar"
Expand Down Expand Up @@ -163,7 +163,7 @@ func testIndexIncrementally(t *testing.T, format string) {
Strip: 0,
}

if err := do(opts, bopts); err != nil {
if err := Index(opts, bopts); err != nil {
t.Fatalf("error creating index: %v", err)
}

Expand Down
Loading
Loading