Skip to content

Commit

Permalink
matchtree: call prepare on symbolRegexpMatchTree subtree (#685)
Browse files Browse the repository at this point in the history
This was a huge oversight that has lived in our codebase since we
introduced symbolRegexpMatchTree. Because we don't call prepare, we
don't correctly use the index for symbol regex queries. From some local
testing this makes a huge difference to performance.

Huge shout-out to @camdencheek who spotted this.

Test Plan: validated with some local searches that results remain the same and
that the statistics for the searches go up for IndexBytesLoaded, but go down
for ContentBytesLoaded, FilesConsidered, FilesLoaded, etc. Added unit tests
which assert the index is used. Also perf tested with hyperfine.

Hyperfine results:

  Benchmark 1: ./zoekt-before -sym '^searcher$'
    Time (mean ± σ):      93.0 ms ±   1.2 ms    [User: 142.2 ms, System: 18.9 ms]
    Range (min … max):    90.8 ms …  95.6 ms    31 runs

  Benchmark 2: ./zoekt-after -sym '^searcher$'
    Time (mean ± σ):      52.3 ms ±   0.5 ms    [User: 76.3 ms, System: 13.0 ms]
    Range (min … max):    50.7 ms …  53.4 ms    53 runs

  Summary
    './zoekt-after -sym '^searcher$'' ran
      1.78 ± 0.03 times faster than './zoekt-before -sym '^searcher$''

For that search, a random comparison of the zoekt stats:

| Stat                  | Before    | After    | Delta      |
|---------------------- |---------- |--------- |----------- |
| ContentBytesLoaded    | 199007382 | 22566033 | -176441349 |
| IndexBytesLoaded      | 3527      | 165645   | 162118     |
| Crashes               | 0         | 0        | 0          |
| Duration              | 57956167  | 17568708 | -40387459  |
| FileCount             | 28        | 28       | 0          |
| ShardFilesConsidered  | 0         | 0        | 0          |
| FilesConsidered       | 28477     | 766      | -27711     |
| FilesLoaded           | 28477     | 766      | -27711     |
| FilesSkipped          | 0         | 0        | 0          |
| ShardsScanned         | 5         | 5        | 0          |
| ShardsSkipped         | 0         | 0        | 0          |
| ShardsSkippedFilter   | 0         | 0        | 0          |
| MatchCount            | 29        | 29       | 0          |
| NgramMatches          | 87        | 4407     | 4320       |
| NgramLookups          | 644       | 644      | 0          |
| Wait                  | 5792      | 11500    | 5708       |
| MatchTreeConstruction | 498042    | 515248   | 17206      |
| MatchTreeSearch       | 97661875  | 23089418 | -74572457  |

Analysis: An absolutely massive reduction in the number of files we consider.
This means we are actually using the index properly. eg look at
ContentBytesLoaded, Duration, FilesConsidered, FilesLoaded. You can also see
that IndexBytesLoaded has gone up since we now use it properly. This was on a
small corpus so will have huge impact in production.

Note that the random changes Wait, MatchTreeConstruction are random, but the
MatchTreeSearch change is a big deal since that is time spent searching after
analysing a query.
  • Loading branch information
keegancsmith authored Nov 9, 2023
1 parent bec12a7 commit db067d1
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 12 deletions.
34 changes: 29 additions & 5 deletions cmd/zoekt/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,24 @@ func startFullProfile(path string, duration time.Duration) func() bool {
})
}

// experimental support for symbol queries. We just convert substring queries
// into symbol queries. Needs to run after query.ExpandFileContent
func toSymbolQuery(q query.Q) query.Q {
return query.Map(q, func(q query.Q) query.Q {
switch s := q.(type) {
case *query.Substring:
if s.Content {
return &query.Symbol{Expr: s}
}
case *query.Regexp:
if s.Content {
return &query.Symbol{Expr: s}
}
}
return q
})
}

func main() {
shard := flag.String("shard", "", "search in a specific shard")
index := flag.String("index_dir",
Expand All @@ -141,6 +159,7 @@ func main() {
verbose := flag.Bool("v", false, "print some background data")
withRepo := flag.Bool("r", false, "print the repo before the file name")
list := flag.Bool("l", false, "print matching filenames only")
sym := flag.Bool("sym", false, "do experimental symbol search")

flag.Usage = func() {
name := os.Args[0]
Expand Down Expand Up @@ -170,29 +189,34 @@ func main() {
log.Fatal(err)
}

query, err := query.Parse(pat)
q, err := query.Parse(pat)
if err != nil {
log.Fatal(err)
}
q = query.Map(q, query.ExpandFileContent)
if *sym {
q = toSymbolQuery(q)
}
q = query.Simplify(q)
if *verbose {
log.Println("query:", query)
log.Println("query:", q)
}

sOpts := zoekt.SearchOptions{
DebugScore: *debug,
}
sres, err := searcher.Search(context.Background(), query, &sOpts)
sres, err := searcher.Search(context.Background(), q, &sOpts)
if err != nil {
log.Fatal(err)
}

// If profiling, do it another time so we measure with
// warm caches.
for run := startCPUProfile(*cpuProfile, *profileTime); run(); {
sres, _ = searcher.Search(context.Background(), query, &sOpts)
sres, _ = searcher.Search(context.Background(), q, &sOpts)
}
for run := startFullProfile(*fullProfile, *profileTime); run(); {
sres, _ = searcher.Search(context.Background(), query, &sOpts)
sres, _ = searcher.Search(context.Background(), q, &sOpts)
}

displayMatches(sres.Files, pat, *withRepo, *list)
Expand Down
97 changes: 90 additions & 7 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,13 +369,28 @@ func TestCaseFold(t *testing.T) {
})
}

// wordsAsSymbols finds all ASCII words in doc.Content which are longer than 2
// chars. Those are then set as symbols.
func wordsAsSymbols(doc Document) Document {
re := regexp.MustCompile(`\b\w{2,}\b`)
var symbols []DocumentSection
for _, match := range re.FindAllIndex(doc.Content, -1) {
symbols = append(symbols, DocumentSection{
Start: uint32(match[0]),
End: uint32(match[1]),
})
}
doc.Symbols = symbols
return doc
}

func TestSearchStats(t *testing.T) {
ctx := context.Background()
searcher := searcherForTest(t, testIndexBuilder(t, nil,
Document{Name: "f1", Content: []byte("x banana y")},
Document{Name: "f2", Content: []byte("x apple y")},
Document{Name: "f3", Content: []byte("x banana apple y")},
// -----------------------------------0123456789012345
wordsAsSymbols(Document{Name: "f1", Content: []byte("x banana y")}),
wordsAsSymbols(Document{Name: "f2", Content: []byte("x apple y")}),
wordsAsSymbols(Document{Name: "f3", Content: []byte("x banana apple y")}),
// --------------------------------------------------0123456789012345
))

andQuery := query.NewAnd(
Expand Down Expand Up @@ -425,7 +440,7 @@ func TestSearchStats(t *testing.T) {
Q: andQuery,
Want: Stats{
FilesLoaded: 1,
ContentBytesLoaded: 18,
ContentBytesLoaded: 22,
IndexBytesLoaded: 8,
NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND
NgramLookups: 104,
Expand All @@ -442,7 +457,7 @@ func TestSearchStats(t *testing.T) {
CaseSensitive: true,
},
Want: Stats{
ContentBytesLoaded: 12,
ContentBytesLoaded: 14,
IndexBytesLoaded: 1,
FileCount: 1,
FilesConsidered: 1,
Expand All @@ -459,7 +474,7 @@ func TestSearchStats(t *testing.T) {
Content: true,
},
Want: Stats{
ContentBytesLoaded: 12,
ContentBytesLoaded: 14,
IndexBytesLoaded: 1,
FileCount: 1,
FilesConsidered: 1,
Expand Down Expand Up @@ -499,6 +514,74 @@ func TestSearchStats(t *testing.T) {
ShardsSkippedFilter: 1,
NgramLookups: 3, // we lookedup "foo" once (1), but lookedup and created "a y" (2).
},
}, {
Name: "symbol-substr-nomatch",
Q: &query.Symbol{Expr: &query.Substring{
Pattern: "banana apple",
Content: true,
CaseSensitive: true,
}},
Want: Stats{
IndexBytesLoaded: 3,
FilesConsidered: 1, // important that we only check 1 file to ensure we are using the index
MatchCount: 0, // even though there is a match it doesn't align with a symbol
ShardsScanned: 1,
NgramMatches: 1,
NgramLookups: 12,
},
}, {
Name: "symbol-substr",
Q: &query.Symbol{Expr: &query.Substring{
Pattern: "apple",
Content: true,
CaseSensitive: true,
}},
Want: Stats{
ContentBytesLoaded: 35,
IndexBytesLoaded: 4,
FileCount: 2,
FilesConsidered: 2, // must be 2 to ensure we used the index
FilesLoaded: 2,
MatchCount: 2, // apple symbols is in two files
ShardsScanned: 1,
NgramMatches: 2,
NgramLookups: 5,
},
}, {
Name: "symbol-regexp-nomatch",
Q: &query.Symbol{Expr: &query.Regexp{
Regexp: mustParseRE("^apple.banana$"),
Content: true,
CaseSensitive: true,
}},
Want: Stats{
ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents
IndexBytesLoaded: 8,
FilesConsidered: 2, // important that we don't check 3 to ensure we are using the index
FilesLoaded: 2,
MatchCount: 0, // even though there is a match it doesn't align with a symbol
ShardsScanned: 1,
NgramMatches: 3,
NgramLookups: 11,
},
}, {
Name: "symbol-regexp",
Q: &query.Symbol{Expr: &query.Regexp{
Regexp: mustParseRE("^app.e$"),
Content: true,
CaseSensitive: true,
}},
Want: Stats{
ContentBytesLoaded: 35,
IndexBytesLoaded: 2,
FileCount: 2,
FilesConsidered: 2, // must be 2 to ensure we used the index
FilesLoaded: 2,
MatchCount: 2, // apple symbols is in two files
ShardsScanned: 1,
NgramMatches: 2,
NgramLookups: 2,
},
}}

for _, tc := range cases {
Expand Down
2 changes: 2 additions & 0 deletions matchtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ type symbolRegexpMatchTree struct {

func (t *symbolRegexpMatchTree) prepare(doc uint32) {
t.reEvaluated = false
t.found = t.found[:0]
t.matchTree.prepare(doc)
}

func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
Expand Down

0 comments on commit db067d1

Please sign in to comment.