Skip to content

Commit

Permalink
shards: selectRepoSet supports queries which are not wrapped in and
Browse files Browse the repository at this point in the history
The previous commit which added support for selectRepoSet in List was
ineffective since the queries we get for List do not look like (and ...)
but instead directly specify the reposet atom.

This commit adds support for it. Additionally to help with our manual
testing we add support for "repo:" query in the selectRepoSet
optimization.

While pairing on this we decided to improve the debug output to tracing
and rename the misnamed query.Q variable in List from r to q.

Test Plan: updated the test case to directly test the sort of queries we
get in List calls. Additionally ran "zoekt-webserver -pprof" and
observed in net/trace output that selectRepoSet optimization reduced the
number of searched shards.

Co-authored-by: Stefan Hengl <[email protected]>
  • Loading branch information
keegancsmith and stefanhengl committed Mar 27, 2024
1 parent 931974d commit 2c00c8e
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 21 deletions.
64 changes: 44 additions & 20 deletions shards/shards.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"os"
"runtime"
"runtime/debug"
"slices"
"sort"
"strconv"
"sync"
Expand Down Expand Up @@ -364,17 +365,30 @@ func (ss *shardedSearcher) Close() {

func selectRepoSet(shards []*rankedShard, q query.Q) ([]*rankedShard, query.Q) {
and, ok := q.(*query.And)
if !ok {
return shards, q
if ok {
return doSelectRepoSet(shards, and)
}

// We have queries which look like (reposet ...) and we want to do the same
// optimizations. To simplify we just always wrap the query in And and then
// on the return value call Simplify to unwrap. In particular this is
// important for List calls.
and = &query.And{Children: []query.Q{q}}
shards, q = doSelectRepoSet(shards, and)
return shards, query.Simplify(q)
}

func doSelectRepoSet(shards []*rankedShard, and *query.And) ([]*rankedShard, query.Q) {
// (and (reposet ...) (q))
// (and true (q)) with a filtered shards
// (and false) // noop

// (and (repobranches ...) (q))
// (and (repobranches ...) (q))

// Note: we also support (and (repo ...) (q)) even though sourcegraph does
// not generate those sorts of queries. This is to support manual testing.

hasReposForPredicate := func(pred func(repo *zoekt.Repository) bool) func(repos []*zoekt.Repository) (any, all bool) {
return func(repos []*zoekt.Repository) (any, all bool) {
any = false
Expand Down Expand Up @@ -402,6 +416,11 @@ func selectRepoSet(shards []*rankedShard, q query.Q) ([]*rankedShard, query.Q) {
hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool {
return setQuery.Repos.Contains(repo.ID)
})
case *query.Repo:
setSize = 0
hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool {
return setQuery.Regexp.MatchString(repo.Name)
})
case *query.BranchesRepos:
for _, br := range setQuery.List {
setSize += int(br.Repos.GetCardinality())
Expand Down Expand Up @@ -447,6 +466,10 @@ func selectRepoSet(shards []*rankedShard, q query.Q) ([]*rankedShard, query.Q) {
return filtered, and
}

// We don't want to mutate the original and, so we clone it before
// mutating it.
and = &query.And{Children: slices.Clone(and.Children)}

// This optimization allows us to avoid the work done by
// indexData.simplify for each shard.
//
Expand All @@ -455,11 +478,7 @@ func selectRepoSet(shards []*rankedShard, q query.Q) ([]*rankedShard, query.Q) {
// shard indexData.simplify will simplify to (and true (content baz)) ->
// (content baz). This work can be done now once, rather than per shard.
switch c := c.(type) {
case *query.RepoSet:
and.Children[i] = &query.Const{Value: true}
return filtered, query.Simplify(and)

case *query.RepoIDs:
case *query.RepoSet, *query.RepoIDs, *query.Repo:
and.Children[i] = &query.Const{Value: true}
return filtered, query.Simplify(and)

Expand Down Expand Up @@ -629,10 +648,13 @@ func streamSearch(ctx context.Context, proc *process, q query.Q, opts *zoekt.Sea
tr.Finish()
}()

tr.LazyPrintf("before selectRepoSet shards:%d", len(shards))
// Select the subset of shards that we will search over for the given query.
shards, q = selectRepoSet(shards, q)
tr.LazyPrintf("after selectRepoSet shards:%d %s", len(shards), q)
{
beforeLen := len(shards)
beforeQ := q
shards, q = selectRepoSet(shards, q)
tr.LazyPrintf("selectRepoSet shards=%d->%d q=%s->%s", beforeLen, len(shards), beforeQ, q)
}

if len(shards) == 0 {
return func() {}, nil
Expand Down Expand Up @@ -888,15 +910,13 @@ func listOneShard(ctx context.Context, s zoekt.Searcher, q query.Q, opts *zoekt.
return s.List(ctx, q, opts)
}

func (ss *shardedSearcher) List(ctx context.Context, r query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) {
func (ss *shardedSearcher) List(ctx context.Context, q query.Q, opts *zoekt.ListOptions) (rl *zoekt.RepoList, err error) {
tr, ctx := trace.New(ctx, "shardedSearcher.List", "")
metricListRunning.Inc()
defer func() {
metricListRunning.Dec()
if rl != nil {
tr.LazyPrintf("repos size: %d", len(rl.Repos))
tr.LazyPrintf("reposmap size: %d", len(rl.ReposMap))
tr.LazyPrintf("crashes: %d", rl.Crashes)
tr.LazyPrintf("repos.size=%d reposmap.size=%d crashes=%d", len(rl.Repos), len(rl.ReposMap), rl.Crashes)
}
if err != nil {
tr.LazyPrintf("error: %v", err)
Expand All @@ -905,9 +925,9 @@ func (ss *shardedSearcher) List(ctx context.Context, r query.Q, opts *zoekt.List
tr.Finish()
}()

r = query.Simplify(r)
q = query.Simplify(q)
isAll := false
if c, ok := r.(*query.Const); ok {
if c, ok := q.(*query.Const); ok {
isAll = c.Value
}

Expand All @@ -919,6 +939,7 @@ func (ss *shardedSearcher) List(ctx context.Context, r query.Q, opts *zoekt.List
tr.LazyPrintf("acquired process")

loaded := ss.getLoaded()
shards := loaded.shards

// Setup what we return now, since we may short circuit if there are no
// shards to search.
Expand All @@ -936,9 +957,12 @@ func (ss *shardedSearcher) List(ctx context.Context, r query.Q, opts *zoekt.List
// PERF: Select the subset of shards that we will search over for the given
// query. A common List query only asks for a specific repo, so this is an
// important optimization.
tr.LazyPrintf("before selectRepoSet shards:%d", len(loaded.shards))
shards, r := selectRepoSet(loaded.shards, r)
tr.LazyPrintf("after selectRepoSet shards:%d %s", len(shards), r)
{
beforeLen := len(shards)
beforeQ := q
shards, q = selectRepoSet(shards, q)
tr.LazyPrintf("selectRepoSet shards=%d->%d q=%s->%s", beforeLen, len(shards), beforeQ, q)
}

if len(shards) == 0 {
return &agg, nil
Expand Down Expand Up @@ -977,7 +1001,7 @@ func (ss *shardedSearcher) List(ctx context.Context, r query.Q, opts *zoekt.List
for range workers {
g.Go(func() error {
for s := range feeder {
result, err := listOneShard(ctx, s, r, opts)
result, err := listOneShard(ctx, s, q, opts)
if err != nil {
return err
}
Expand Down
20 changes: 19 additions & 1 deletion shards/shards_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,16 @@ func TestShardedSearcher_DocumentRanking(t *testing.T) {
func TestFilteringShardsByRepoSetOrBranchesReposOrRepoIDs(t *testing.T) {
ss := newShardedSearcher(1)

// namePrefix is so we can create a repo:foo filter and match the same set
// of repos.
namePrefix := [3]string{"foo", "bar", "baz"}

repoSetNames := []string{}
repoIDs := []uint32{}
n := 10 * runtime.GOMAXPROCS(0)
for i := 0; i < n; i++ {
shardName := fmt.Sprintf("shard%d", i)
repoName := fmt.Sprintf("repository%.3d", i)
repoName := fmt.Sprintf("%s-repository%.3d", namePrefix[i%3], i)
repoID := hash(repoName)

if i%3 == 0 {
Expand Down Expand Up @@ -329,6 +333,7 @@ func TestFilteringShardsByRepoSetOrBranchesReposOrRepoIDs(t *testing.T) {
sub := &query.Substring{Pattern: "bla"}

repoIDsQuery := query.NewRepoIDs(repoIDs...)
repoQuery := &query.Repo{Regexp: regexp.MustCompile("^foo-.*")}

queries := []query.Q{
query.NewAnd(set, sub),
Expand All @@ -342,6 +347,19 @@ func TestFilteringShardsByRepoSetOrBranchesReposOrRepoIDs(t *testing.T) {
query.NewAnd(repoIDsQuery, sub),
// Test with the same repoIDs again
query.NewAnd(repoIDsQuery, sub),

query.NewAnd(repoQuery, sub),
query.NewAnd(repoQuery, sub),

// List has queries which are just the reposet atoms. We also test twice.
set,
set,
branchesRepos,
branchesRepos,
repoIDsQuery,
repoIDsQuery,
repoQuery,
repoQuery,
}

for _, q := range queries {
Expand Down

0 comments on commit 2c00c8e

Please sign in to comment.