Skip to content

Commit

Permalink
archive: e2e test for ranking against sourcegraph repo
Browse files Browse the repository at this point in the history
This is an initial framework for checking search results against golden
files, using a real repository. For now we have added only one query and
one repository, but it should be straightforward to grow this list
further.

The golden files we write to disk are a summary of results with debug
information. This matches how we have been using the zoekt CLI tool on
the keyword branch during our ranking work.

Test Plan: go test
  • Loading branch information
keegancsmith committed Nov 15, 2023
1 parent 137eb8f commit 9dad685
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 0 deletions.
221 changes: 221 additions & 0 deletions cmd/zoekt-archive-index/e2e_rank_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
package main

import (
"bytes"
"context"
"flag"
"fmt"
"io"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/build"
"github.com/sourcegraph/zoekt/query"
"github.com/sourcegraph/zoekt/shards"
)

// update is the -update test flag. When it is set, TestRanking overwrites each
// golden file under testdata with the output it just produced before reading
// the file back for comparison.
var update = flag.Bool("update", false, "update golden file")

// TestRanking is an end-to-end ranking test. It indexes every archive in
// archiveURLs into a temporary index directory, runs each entry of queries
// against that index, and compares a textual summary of the results (see
// marshalMatches) with a golden file at testdata/<normalised query>.txt.
//
// Run with -update to rewrite the golden files from the current output.
// The test is skipped (or fails on CI) when ctags is unavailable; see
// requireCTags.
func TestRanking(t *testing.T) {
	requireCTags(t)

	// The corpus and the queries evaluated against it. Grow these lists to
	// extend coverage.
	archiveURLs := []string{
		"https://github.com/sourcegraph/sourcegraph/tree/v5.2.2",
	}
	queries := []string{
		"graphql type User",
	}

	indexDir := t.TempDir()
	for _, archiveURL := range archiveURLs {
		err := indexURL(indexDir, archiveURL)
		if err != nil {
			t.Fatal(err)
		}
	}

	searcher, err := shards.NewDirectorySearcher(indexDir)
	if err != nil {
		t.Fatalf("NewDirectorySearcher(%s): %v", indexDir, err)
	}
	defer searcher.Close()

	// goldenRune maps a query rune to its file-name-safe form: spaces and
	// colons become underscores, ASCII letters and digits are kept, and
	// every other rune is dropped.
	goldenRune := func(r rune) rune {
		switch {
		case r == ' ' || r == ':':
			return '_'
		case '0' <= r && r <= '9',
			'a' <= r && r <= 'z',
			'A' <= r && r <= 'Z':
			return r
		default:
			return -1
		}
	}

	for _, queryStr := range queries {
		// The normalised query doubles as subtest name and golden file name.
		name := strings.Map(goldenRune, queryStr)

		t.Run(name, func(t *testing.T) {
			parsed, err := query.Parse(queryStr)
			if err != nil {
				t.Fatal(err)
			}

			// DebugScore is enabled because the golden files record the
			// per-file and per-line debug strings of the results.
			result, err := searcher.Search(context.Background(), parsed, &zoekt.SearchOptions{
				DebugScore: true,
			})
			if err != nil {
				t.Fatal(err)
			}

			var summary bytes.Buffer
			marshalMatches(&summary, queryStr, parsed, result.Files)
			got := summary.Bytes()

			goldenPath := filepath.Join("testdata", name+".txt")
			if *update {
				if err := os.WriteFile(goldenPath, got, 0600); err != nil {
					t.Fatal(err)
				}
			}

			// Always read the file back, even right after updating it, so
			// both modes go through the same comparison.
			want, err := os.ReadFile(goldenPath)
			if err != nil {
				t.Fatal(err)
			}

			diff := cmp.Diff(string(want), string(got))
			if diff != "" {
				t.Fatalf("unexpected (-want, +got):\n%s", diff)
			}
		})
	}
}

// tarballCache is the directory in which indexURL keeps downloaded archives,
// so that an archive is only fetched when no cached copy exists. The path
// lives under /tmp and is suffixed with the value of the USER environment
// variable (empty if USER is unset).
var tarballCache = "/tmp/zoekt-test-ranking-tarballs-" + os.Getenv("USER")

// indexURL indexes the archive identified by u into indexDir.
//
// The archive is downloaded into tarballCache the first time it is needed and
// reused from there on later runs. Indexing then reads the cached file
// rather than the remote URL.
//
// It returns an error if the cache directory cannot be created, the cached
// file cannot be inspected, the download fails, or indexing fails.
func indexURL(indexDir, u string) error {
	if err := os.MkdirAll(tarballCache, 0700); err != nil {
		return err
	}

	opts := Options{
		Archive: u,
	}
	opts.SetDefaults() // sets metadata like Name and the codeload URL
	u = opts.Archive

	// update Archive location to cached location
	cacheBase := fmt.Sprintf("%s-%s%s.tar.gz", url.QueryEscape(opts.Name), opts.Branch, opts.Commit) // assume .tar.gz
	path := filepath.Join(tarballCache, cacheBase)
	opts.Archive = path

	// Only download when the cached file is missing. Any other stat failure
	// (e.g. a permission problem) is reported instead of being ignored, since
	// indexing an archive we could not even stat would fail less clearly
	// later on.
	if _, err := os.Stat(path); os.IsNotExist(err) {
		if err := download(u, path); err != nil {
			return err
		}
	} else if err != nil {
		return err
	}

	// TODO scip
	// languageMap := make(ctags.LanguageMap)
	// for _, lang := range []string{"kotlin", "rust", "ruby", "go", "python", "javascript", "c_sharp", "scala", "typescript", "zig"} {
	// 	languageMap[lang] = ctags.ScipCTags
	// }

	err := do(opts, build.Options{
		IndexDir:         indexDir,
		CTagsMustSucceed: true,
	})
	if err != nil {
		return fmt.Errorf("failed to index %s: %w", opts.Archive, err)
	}

	return nil
}

// download fetches url (via openReader) and stores its contents at dst.
//
// The data is first written to dst+".part" and only renamed to dst once it
// has been fully written and the file closed, so dst never holds a partial
// download. If anything fails after the temporary file was created, that
// file is closed and removed again rather than left behind.
func download(url, dst string) error {
	tmpPath := dst + ".part"

	rc, err := openReader(url)
	if err != nil {
		return err
	}
	defer rc.Close()

	f, err := os.OpenFile(tmpPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
	if err != nil {
		return err
	}

	// Clean up on every failure path below. The file is closed before it is
	// removed; both are best effort because the error that got us here is
	// the one worth reporting.
	done := false
	defer func() {
		if done {
			return
		}
		_ = f.Close()
		_ = os.Remove(tmpPath)
	}()

	if _, err := io.Copy(f, rc); err != nil {
		return err
	}

	// Close explicitly so that a failed flush is reported before the rename.
	if err := f.Close(); err != nil {
		return err
	}

	if err := os.Rename(tmpPath, dst); err != nil {
		return err
	}

	done = true
	return nil
}

// Limits applied by marshalMatches when summarising search results. Anything
// past a limit is reported only as a "hidden N more ..." count.
const (
	// lineMatchesPerFile is the number of line matches written per file.
	lineMatchesPerFile = 3
	// fileMatchesPerSearch is the number of file matches written per search.
	fileMatchesPerSearch = 6
)

// marshalMatches writes a human-readable summary of a search to w.
//
// The summary starts with the raw query string and the parsed query q,
// followed by one paragraph per file match: a header line with repository,
// file name and the file's debug string, then its line matches as
// "<line number>:<line>\t<debug score>". At most fileMatchesPerSearch files
// and lineMatchesPerFile lines per file are written; the remainder is
// reported as a "hidden N more ..." count.
//
// Write errors are deliberately ignored.
func marshalMatches(w io.Writer, queryStr string, q query.Q, files []zoekt.FileMatch) {
	_, _ = fmt.Fprintf(w, "queryString: %s\n", queryStr)
	_, _ = fmt.Fprintf(w, "query: %s\n\n", q)

	shownFiles, hiddenFiles := splitAtIndex(files, fileMatchesPerSearch)
	for i := range shownFiles {
		fm := &shownFiles[i]
		_, _ = fmt.Fprintf(w, "%s/%s\t%s\n", fm.Repository, fm.FileName, fm.Debug)

		shownLines, hiddenLines := splitAtIndex(fm.LineMatches, lineMatchesPerFile)
		for _, lm := range shownLines {
			_, _ = fmt.Fprintf(w, "%d:%s\t%s\n", lm.LineNumber, lm.Line, lm.DebugScore)
		}

		if n := len(hiddenLines); n > 0 {
			_, _ = fmt.Fprintf(w, "hidden %d more line matches\n", n)
		}

		// Blank line separating this file's paragraph from the next.
		_, _ = fmt.Fprintln(w)
	}

	if n := len(hiddenFiles); n > 0 {
		_, _ = fmt.Fprintf(w, "hidden %d more file matches\n", n)
	}
}

// splitAtIndex cuts s in two at idx and returns s[:idx] and s[idx:]. When idx
// is at or beyond the end of s there is nothing to cut off, so s is returned
// unchanged together with a nil remainder. Both results share s's backing
// array; nothing is copied.
func splitAtIndex[E any](s []E, idx int) ([]E, []E) {
	if idx >= len(s) {
		return s, nil
	}
	head, tail := s[:idx], s[idx:]
	return head, tail
}

// requireCTags makes sure ctags is available before a test relies on it.
//
// ctags counts as available when the CTAGS_COMMAND environment variable is
// non-empty or a universal-ctags binary can be found on PATH. Otherwise the
// test fails if the CI environment variable is non-empty, and is skipped if
// it is not.
func requireCTags(tb testing.TB) {
	tb.Helper()

	available := os.Getenv("CTAGS_COMMAND") != ""
	if !available {
		// Only consult PATH when no explicit command was configured.
		_, err := exec.LookPath("universal-ctags")
		available = err == nil
	}
	if available {
		return
	}

	// On CI we require ctags to be available. Otherwise we skip
	onCI := os.Getenv("CI") != ""
	if onCI {
		tb.Fatal("universal-ctags is missing")
		return
	}
	tb.Skip("universal-ctags is missing")
}
40 changes: 40 additions & 0 deletions cmd/zoekt-archive-index/testdata/graphql_type_User.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
queryString: graphql type User
query: (and substr:"graphql" substr:"type" case_substr:"User")

github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/schema.graphql score:8807.21 <- atom(4):300.00, fragment:8500.00, doc-order:7.21
6376:type User implements Node & SettingsSubject & Namespace { score:8500.00 <- WordMatch:500.00, Symbol:7000.00, kind:GraphQL:type:1000.00
3862: type: GitRefType score:8050.00 <- WordMatch:500.00, Symbol:7000.00, kind:GraphQL:field:550.00
5037: type: GitRefType! score:8050.00 <- WordMatch:500.00, Symbol:7000.00, kind:GraphQL:field:550.00
hidden 460 more line matches

github.com/sourcegraph/sourcegraph/internal/types/types.go score:8759.73 <- atom(4):300.00, fragment:8450.00, doc-order:9.73
850:type User struct { score:8450.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:struct:950.00
1372: Type *SearchCountStatistics score:8250.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:member:750.00
1766: Type string score:8250.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:member:750.00
hidden 234 more line matches

github.com/sourcegraph/sourcegraph/client/web/src/enterprise/insights/core/backend/gql-backend/methods/get-dashboard-owners.ts score:8269.38 <- atom(3):266.67, fragment:8000.00, doc-order:2.71
22: type: InsightsDashboardOwnerType.Global, score:8000.00 <- WordMatch:500.00, Symbol:7000.00, kind:TypeScript:constant:500.00
32: type: InsightsDashboardOwnerType.Personal, score:8000.00 <- WordMatch:500.00, Symbol:7000.00, kind:TypeScript:constant:500.00
18: const { currentUser, site } = data score:6500.00 <- WordMatch:500.00, EdgeSymbol:5500.00, kind:TypeScript:constant:500.00
hidden 8 more line matches

github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/apitest/types.go score:8751.64 <- atom(4):300.00, fragment:8450.00, doc-order:1.64
47:type User struct { score:8450.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:struct:950.00
9: Typename string `json:"__typename"` score:6300.00 <- PartialWordMatch:50.00, EdgeSymbol:5500.00, kind:Go:member:750.00
32: Typename string `json:"__typename"` score:6300.00 <- PartialWordMatch:50.00, EdgeSymbol:5500.00, kind:Go:member:750.00
hidden 11 more line matches

github.com/sourcegraph/sourcegraph/cmd/frontend/internal/batches/resolvers/apitest/types.go score:8751.15 <- atom(4):300.00, fragment:8450.00, doc-order:1.15
52:type User struct { score:8450.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:struct:950.00
364: User *User score:8250.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:member:750.00
393: Type string score:8250.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:member:750.00
hidden 68 more line matches

github.com/sourcegraph/sourcegraph/internal/extsvc/github/common.go score:8725.50 <- atom(3):266.67, fragment:8450.00, doc-order:8.84
2030:type User struct { score:8450.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:struct:950.00
66: User *Actor `json:"User,omitempty"` score:8250.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:member:750.00
527: Type string score:8250.00 <- WordMatch:500.00, Symbol:7000.00, kind:Go:member:750.00
hidden 136 more line matches

hidden 743 more file matches

0 comments on commit 9dad685

Please sign in to comment.