matchtree: call prepare on symbolRegexpMatchTree subtree (#685)

This was a huge oversight that has lived in our codebase since we introduced symbolRegexpMatchTree. Because we don't call prepare, we don't correctly use the index for symbol regex queries. From some local testing this makes a huge difference to performance. Huge shout-out to @camdencheek who spotted this. Test Plan: validated with some local searches that results remain the same and that the statistics for the searches go up for IndexBytesLoaded, but go down for ContentBytesLoaded, FilesConsidered, FilesLoaded, etc. Added unit tests which assert the index is used. Also perf tested with hyperfine. Hyperfine results: Benchmark 1: ./zoekt-before -sym '^searcher$' Time (mean ± σ): 93.0 ms ± 1.2 ms [User: 142.2 ms, System: 18.9 ms] Range (min … max): 90.8 ms … 95.6 ms 31 runs Benchmark 2: ./zoekt-after -sym '^searcher$' Time (mean ± σ): 52.3 ms ± 0.5 ms [User: 76.3 ms, System: 13.0 ms] Range (min … max): 50.7 ms … 53.4 ms 53 runs Summary './zoekt-after -sym '^searcher$'' ran 1.78 ± 0.03 times faster than './zoekt-before -sym '^searcher$'' For that search, a random comparison of the zoekt stats: | Stat | Before | After | Delta | |---------------------- |---------- |--------- |----------- | | ContentBytesLoaded | 199007382 | 22566033 | -176441349 | | IndexBytesLoaded | 3527 | 165645 | 162118 | | Crashes | 0 | 0 | 0 | | Duration | 57956167 | 17568708 | -40387459 | | FileCount | 28 | 28 | 0 | | ShardFilesConsidered | 0 | 0 | 0 | | FilesConsidered | 28477 | 766 | -27711 | | FilesLoaded | 28477 | 766 | -27711 | | FilesSkipped | 0 | 0 | 0 | | ShardsScanned | 5 | 5 | 0 | | ShardsSkipped | 0 | 0 | 0 | | ShardsSkippedFilter | 0 | 0 | 0 | | MatchCount | 29 | 29 | 0 | | NgramMatches | 87 | 4407 | 4320 | | NgramLookups | 644 | 644 | 0 | | Wait | 5792 | 11500 | 5708 | | MatchTreeConstruction | 498042 | 515248 | 17206 | | MatchTreeSearch | 97661875 | 23089418 | -74572457 | Analysis: An absolutely massive reduction in the number of files we consider. This means we are actually using the index properly. eg look at ContentBytesLoaded, Duration, FilesConsidered, FilesLoaded. You can also see that IndexBytesLoaded has gone up since we now use it properly. This was on a small corpus so will have huge impact in production. Note that the random changes Wait, MatchTreeConstruction are random, but the MatchTreeSearch change is a big deal since that is time spent searching after analysing a query.
sourcegraph · Nov 9, 2023 · db067d1 · db067d1
1 parent bec12a7
commit db067d1
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 12 deletions.
diff --git a/cmd/zoekt/main.go b/cmd/zoekt/main.go
@@ -130,6 +130,24 @@ func startFullProfile(path string, duration time.Duration) func() bool {
 	})
 }
 
+// experimental support for symbol queries. We just convert substring queries
+// into symbol queries. Needs to run after query.ExpandFileContent
+func toSymbolQuery(q query.Q) query.Q {
+	return query.Map(q, func(q query.Q) query.Q {
+		switch s := q.(type) {
+		case *query.Substring:
+			if s.Content {
+				return &query.Symbol{Expr: s}
+			}
+		case *query.Regexp:
+			if s.Content {
+				return &query.Symbol{Expr: s}
+			}
+		}
+		return q
+	})
+}
+
 func main() {
 	shard := flag.String("shard", "", "search in a specific shard")
 	index := flag.String("index_dir",
@@ -141,6 +159,7 @@ func main() {
 	verbose := flag.Bool("v", false, "print some background data")
 	withRepo := flag.Bool("r", false, "print the repo before the file name")
 	list := flag.Bool("l", false, "print matching filenames only")
+	sym := flag.Bool("sym", false, "do experimental symbol search")
 
 	flag.Usage = func() {
 		name := os.Args[0]
@@ -170,29 +189,34 @@ func main() {
 		log.Fatal(err)
 	}
 
-	query, err := query.Parse(pat)
+	q, err := query.Parse(pat)
 	if err != nil {
 		log.Fatal(err)
 	}
+	q = query.Map(q, query.ExpandFileContent)
+	if *sym {
+		q = toSymbolQuery(q)
+	}
+	q = query.Simplify(q)
 	if *verbose {
-		log.Println("query:", query)
+		log.Println("query:", q)
 	}
 
 	sOpts := zoekt.SearchOptions{
 		DebugScore: *debug,
 	}
-	sres, err := searcher.Search(context.Background(), query, &sOpts)
+	sres, err := searcher.Search(context.Background(), q, &sOpts)
 	if err != nil {
 		log.Fatal(err)
 	}
 
 	// If profiling, do it another time so we measure with
 	// warm caches.
 	for run := startCPUProfile(*cpuProfile, *profileTime); run(); {
-		sres, _ = searcher.Search(context.Background(), query, &sOpts)
+		sres, _ = searcher.Search(context.Background(), q, &sOpts)
 	}
 	for run := startFullProfile(*fullProfile, *profileTime); run(); {
-		sres, _ = searcher.Search(context.Background(), query, &sOpts)
+		sres, _ = searcher.Search(context.Background(), q, &sOpts)
 	}
 
 	displayMatches(sres.Files, pat, *withRepo, *list)

diff --git a/index_test.go b/index_test.go
@@ -369,13 +369,28 @@ func TestCaseFold(t *testing.T) {
 	})
 }
 
+// wordsAsSymbols finds all ASCII words in doc.Content which are longer than 2
+// chars. Those are then set as symbols.
+func wordsAsSymbols(doc Document) Document {
+	re := regexp.MustCompile(`\b\w{2,}\b`)
+	var symbols []DocumentSection
+	for _, match := range re.FindAllIndex(doc.Content, -1) {
+		symbols = append(symbols, DocumentSection{
+			Start: uint32(match[0]),
+			End:   uint32(match[1]),
+		})
+	}
+	doc.Symbols = symbols
+	return doc
+}
+
 func TestSearchStats(t *testing.T) {
 	ctx := context.Background()
 	searcher := searcherForTest(t, testIndexBuilder(t, nil,
-		Document{Name: "f1", Content: []byte("x banana y")},
-		Document{Name: "f2", Content: []byte("x apple y")},
-		Document{Name: "f3", Content: []byte("x banana apple y")},
-		// -----------------------------------0123456789012345
+		wordsAsSymbols(Document{Name: "f1", Content: []byte("x banana y")}),
+		wordsAsSymbols(Document{Name: "f2", Content: []byte("x apple y")}),
+		wordsAsSymbols(Document{Name: "f3", Content: []byte("x banana apple y")}),
+		// --------------------------------------------------0123456789012345
 	))
 
 	andQuery := query.NewAnd(
@@ -425,7 +440,7 @@ func TestSearchStats(t *testing.T) {
 			Q:    andQuery,
 			Want: Stats{
 				FilesLoaded:        1,
-				ContentBytesLoaded: 18,
+				ContentBytesLoaded: 22,
 				IndexBytesLoaded:   8,
 				NgramMatches:       3, // we look at doc 1, because it's max(0,1) due to AND
 				NgramLookups:       104,
@@ -442,7 +457,7 @@ func TestSearchStats(t *testing.T) {
 				CaseSensitive: true,
 			},
 			Want: Stats{
-				ContentBytesLoaded: 12,
+				ContentBytesLoaded: 14,
 				IndexBytesLoaded:   1,
 				FileCount:          1,
 				FilesConsidered:    1,
@@ -459,7 +474,7 @@ func TestSearchStats(t *testing.T) {
 				Content: true,
 			},
 			Want: Stats{
-				ContentBytesLoaded: 12,
+				ContentBytesLoaded: 14,
 				IndexBytesLoaded:   1,
 				FileCount:          1,
 				FilesConsidered:    1,
@@ -499,6 +514,74 @@ func TestSearchStats(t *testing.T) {
 				ShardsSkippedFilter: 1,
 				NgramLookups:        3, // we lookedup "foo" once (1), but lookedup and created "a y" (2).
 			},
+		}, {
+			Name: "symbol-substr-nomatch",
+			Q: &query.Symbol{Expr: &query.Substring{
+				Pattern:       "banana apple",
+				Content:       true,
+				CaseSensitive: true,
+			}},
+			Want: Stats{
+				IndexBytesLoaded: 3,
+				FilesConsidered:  1, // important that we only check 1 file to ensure we are using the index
+				MatchCount:       0, // even though there is a match it doesn't align with a symbol
+				ShardsScanned:    1,
+				NgramMatches:     1,
+				NgramLookups:     12,
+			},
+		}, {
+			Name: "symbol-substr",
+			Q: &query.Symbol{Expr: &query.Substring{
+				Pattern:       "apple",
+				Content:       true,
+				CaseSensitive: true,
+			}},
+			Want: Stats{
+				ContentBytesLoaded: 35,
+				IndexBytesLoaded:   4,
+				FileCount:          2,
+				FilesConsidered:    2, // must be 2 to ensure we used the index
+				FilesLoaded:        2,
+				MatchCount:         2, // apple symbols is in two files
+				ShardsScanned:      1,
+				NgramMatches:       2,
+				NgramLookups:       5,
+			},
+		}, {
+			Name: "symbol-regexp-nomatch",
+			Q: &query.Symbol{Expr: &query.Regexp{
+				Regexp:        mustParseRE("^apple.banana$"),
+				Content:       true,
+				CaseSensitive: true,
+			}},
+			Want: Stats{
+				ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents
+				IndexBytesLoaded:   8,
+				FilesConsidered:    2, // important that we don't check 3 to ensure we are using the index
+				FilesLoaded:        2,
+				MatchCount:         0, // even though there is a match it doesn't align with a symbol
+				ShardsScanned:      1,
+				NgramMatches:       3,
+				NgramLookups:       11,
+			},
+		}, {
+			Name: "symbol-regexp",
+			Q: &query.Symbol{Expr: &query.Regexp{
+				Regexp:        mustParseRE("^app.e$"),
+				Content:       true,
+				CaseSensitive: true,
+			}},
+			Want: Stats{
+				ContentBytesLoaded: 35,
+				IndexBytesLoaded:   2,
+				FileCount:          2,
+				FilesConsidered:    2, // must be 2 to ensure we used the index
+				FilesLoaded:        2,
+				MatchCount:         2, // apple symbols is in two files
+				ShardsScanned:      1,
+				NgramMatches:       2,
+				NgramLookups:       2,
+			},
 		}}
 
 		for _, tc := range cases {

diff --git a/matchtree.go b/matchtree.go
@@ -196,6 +196,8 @@ type symbolRegexpMatchTree struct {
 
 func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 	t.reEvaluated = false
+	t.found = t.found[:0]
+	t.matchTree.prepare(doc)
 }
 
 func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {