Skip to content

Commit

Permalink
matchtree: do not depend on String for symbol all optimization
Browse files Browse the repository at this point in the history
In go 1.22 the way go converts a query like .* into a String changed.
This meant that when we compiled zoekt in the sourcegraph codebase our
"regex matches all" optimization stopped working.

This commit moves us to a simple inspection of the syntax tree which
will be stable across versions and to be honest just feels better.

To do this we introduce a field "origRegexp" to the regexpMatchTree so
we can avoid needing to recompile the syntax tree. Note: Regexp does not
offer a way to get at its internal syntax.Regexp which would be nice.

Test Plan: go test and added a unit test
  • Loading branch information
keegancsmith committed Mar 12, 2024
1 parent d94b6fe commit f0b55a2
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 7 deletions.
42 changes: 35 additions & 7 deletions matchtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ type noVisitMatchTree struct {
type regexpMatchTree struct {
regexp *regexp.Regexp

// origRegexp is the original parsed regexp from the query structure. It
// does not include mutations such as case sensitivity.
origRegexp *syntax.Regexp

fileName bool

// mutable
Expand All @@ -200,8 +204,9 @@ func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
}

return &regexpMatchTree{
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
fileName: s.FileName,
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
origRegexp: s.Regexp,
fileName: s.FileName,
}
}

Expand Down Expand Up @@ -1107,19 +1112,19 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
}, nil
}

var regexp *regexp.Regexp
var regexpMT *regexpMatchTree
visitMatchTree(subMT, func(mt matchTree) {
if t, ok := mt.(*regexpMatchTree); ok {
regexp = t.regexp
regexpMT = t
}
})
if regexp == nil {
if regexpMT == nil {
return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
}

return &symbolRegexpMatchTree{
regexp: regexp,
all: regexp.String() == "(?i)(?-s:.*)",
regexp: regexpMT.regexp,
all: isRegexpAll(regexpMT.origRegexp),
matchTree: subMT,
}, nil

Expand Down Expand Up @@ -1376,3 +1381,26 @@ func pruneMatchTree(mt matchTree) (matchTree, error) {
}
return mt, err
}

// isRegexpAll returns true if the query matches all possible lines.
//
// Note: it is possible for a funky regex to actually match all but this
// returns false. This returns true for normal looking regexes like ".*" or
// "(?-s:.*)".
func isRegexpAll(r *syntax.Regexp) bool {
// Note: we don't care about any flags since we are looking for .* and we
// don't mind if . matches all or everything but newline.

// Our main target: .*
if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
// (?s:.) or (?-s:.)
return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
}

// Strip away expressions being wrapped in paranthesis
if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
return isRegexpAll(r.Sub[0])
}

return false
}
36 changes: 36 additions & 0 deletions matchtree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package zoekt

import (
"reflect"
"regexp/syntax"
"testing"

"github.com/RoaringBitmap/roaring"
Expand Down Expand Up @@ -385,3 +386,38 @@ func TestRepoIDs(t *testing.T) {
t.Fatalf("expected %d document, but got at least 1 more", len(want))
}
}

func TestIsRegexpAll(t *testing.T) {
valid := []string{
".*",
"(.*)",
"(?-s:.*)",
"(?s:.*)",
"(?i)(?-s:.*)",
}
invalid := []string{
".",
"foo",
"(foo.*)",
}

for _, s := range valid {
r, err := syntax.Parse(s, syntax.Perl)
if err != nil {
t.Fatal(err)
}
if !isRegexpAll(r) {
t.Errorf("expected %q to match all", s)
}
}

for _, s := range invalid {
r, err := syntax.Parse(s, syntax.Perl)
if err != nil {
t.Fatal(err)
}
if isRegexpAll(r) {
t.Errorf("expected %q to not match all", s)
}
}
}

0 comments on commit f0b55a2

Please sign in to comment.