From c39011a14191514737c52ab3f0e06348915c087e Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 2 Apr 2024 09:12:38 +0200 Subject: [PATCH] all: use a faster vendored regexp/syntax/Regexp.String (#753) We replace all calls to Regexp.String with a vendored version which is faster. go1.22 introduced a commit which "minimizes" the string returned by Regexp.String(). Part of what it does is enumerate the literal runes in your string to calculate flags related to unicode and case sensitivity. This can be quite slow, but is made worse by the fact we call it per shard per regexp in your query.Q to construct the matchtree. Currently Regexp.String() represents 40% of CPU time on sourcegraph.com. Before go1.22 it was ~0%. Note: This is a temporary change to resolve the issue. I have a deeper change to make this less clumsy. Note: In one place we remove the use of string by relying on Regexp.Equal instead. Test Plan: go test --- internal/syntaxutil/README.md | 58 +++++ internal/syntaxutil/alias_test.go | 51 ++++ internal/syntaxutil/parse_test.go | 397 ++++++++++++++++++++++++++++++ internal/syntaxutil/regexp.go | 192 +++++++++++++++ matchtree.go | 3 +- matchtree_test.go | 4 +- query/query.go | 7 +- query/regexp.go | 6 +- query/regexp_test.go | 8 +- 9 files changed, 715 insertions(+), 11 deletions(-) create mode 100644 internal/syntaxutil/README.md create mode 100644 internal/syntaxutil/alias_test.go create mode 100644 internal/syntaxutil/parse_test.go create mode 100644 internal/syntaxutil/regexp.go diff --git a/internal/syntaxutil/README.md b/internal/syntaxutil/README.md new file mode 100644 index 000000000..2b551db32 --- /dev/null +++ b/internal/syntaxutil/README.md @@ -0,0 +1,58 @@ +# vendored std regexp/syntax + +This package contains a vendored copy of std regexp/syntax. However, it only +contains the code for converting syntax.Regexp into a String. 
It is the +version of the code at a recent go commit, but with a commit which introduces +a significant performance regression reverted. + +At the time of writing regexp.String on go1.22 is taking 40% of CPU at +Sourcegraph. This should return to ~0% with this vendored code. + +https://github.com/sourcegraph/sourcegraph/issues/61462 + +## Vendored commit + +``` +commit 2e1003e2f7e42efc5771812b9ee6ed264803796c +Author: Daniel Martí +Date: Tue Mar 26 22:59:41 2024 +0200 + + cmd/go: replace reflect.DeepEqual with slices.Equal and maps.Equal + + All of these maps and slices are made up of comparable types, + so we can avoid the overhead of reflection entirely. + + Change-Id: If77dbe648a336ba729c171e84c9ff3f7e160297d + Reviewed-on: https://go-review.googlesource.com/c/go/+/574597 + Reviewed-by: Than McIntosh + LUCI-TryBot-Result: Go LUCI + Reviewed-by: Ian Lance Taylor +``` + +## Reverted commit + +``` +commit 98c9f271d67b501ecf2ce995539abd2cdc81d505 +Author: Russ Cox +Date: Wed Jun 28 17:45:26 2023 -0400 + + regexp/syntax: use more compact Regexp.String output + + Compact the Regexp.String output. It was only ever intended for debugging, + but there are at least some uses in the wild where regexps are built up + using regexp/syntax and then formatted using the String method. + Compact the output to help that use case. Specifically: + + - Compact 2-element character class ranges: [a-b] -> [ab]. + - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E). + + Fixes #57950. 
+ + Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb + Reviewed-on: https://go-review.googlesource.com/c/go/+/507015 + TryBot-Result: Gopher Robot + Reviewed-by: Than McIntosh + Run-TryBot: Russ Cox + Reviewed-by: Rob Pike + Auto-Submit: Russ Cox +``` diff --git a/internal/syntaxutil/alias_test.go b/internal/syntaxutil/alias_test.go new file mode 100644 index 000000000..c2a5d6a08 --- /dev/null +++ b/internal/syntaxutil/alias_test.go @@ -0,0 +1,51 @@ +package syntaxutil + +import "regexp/syntax" + +// A bunch of aliases to avoid needing to modify parse_test.go too much. + +type Regexp = syntax.Regexp + +type Op = syntax.Op + +const ( + OpNoMatch = syntax.OpNoMatch + OpEmptyMatch = syntax.OpEmptyMatch + OpLiteral = syntax.OpLiteral + OpCharClass = syntax.OpCharClass + OpAnyCharNotNL = syntax.OpAnyCharNotNL + OpAnyChar = syntax.OpAnyChar + OpBeginLine = syntax.OpBeginLine + OpEndLine = syntax.OpEndLine + OpBeginText = syntax.OpBeginText + OpEndText = syntax.OpEndText + OpWordBoundary = syntax.OpWordBoundary + OpNoWordBoundary = syntax.OpNoWordBoundary + OpCapture = syntax.OpCapture + OpStar = syntax.OpStar + OpPlus = syntax.OpPlus + OpQuest = syntax.OpQuest + OpRepeat = syntax.OpRepeat + OpConcat = syntax.OpConcat + OpAlternate = syntax.OpAlternate +) + +type Flags = syntax.Flags + +const ( + FoldCase = syntax.FoldCase + Literal = syntax.Literal + ClassNL = syntax.ClassNL + DotNL = syntax.DotNL + OneLine = syntax.OneLine + NonGreedy = syntax.NonGreedy + PerlX = syntax.PerlX + UnicodeGroups = syntax.UnicodeGroups + WasDollar = syntax.WasDollar + Simple = syntax.Simple + MatchNL = syntax.MatchNL + Perl = syntax.Perl + POSIX = syntax.POSIX +) + +var Parse = syntax.Parse diff --git a/internal/syntaxutil/parse_test.go b/internal/syntaxutil/parse_test.go new file mode 100644 index 000000000..23b8a916f --- /dev/null +++ b/internal/syntaxutil/parse_test.go @@ -0,0 +1,397 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntaxutil + +import ( + "fmt" + "strings" + "testing" + "unicode" +) + +type parseTest struct { + Regexp string + Dump string +} + +var parseTests = []parseTest{ + // Base cases + {`a`, `lit{a}`}, + {`a.`, `cat{lit{a}dot{}}`}, + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, + {`ab`, `str{ab}`}, + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, + {`abc`, `str{abc}`}, + {`a|^`, `alt{lit{a}bol{}}`}, + {`a|b`, `cc{0x61-0x62}`}, + {`(a)`, `cap{lit{a}}`}, + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, + {`a*`, `star{lit{a}}`}, + {`a+`, `plus{lit{a}}`}, + {`a?`, `que{lit{a}}`}, + {`a{2}`, `rep{2,2 lit{a}}`}, + {`a{2,3}`, `rep{2,3 lit{a}}`}, + {`a{2,}`, `rep{2,-1 lit{a}}`}, + {`a*?`, `nstar{lit{a}}`}, + {`a+?`, `nplus{lit{a}}`}, + {`a??`, `nque{lit{a}}`}, + {`a{2}?`, `nrep{2,2 lit{a}}`}, + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, + // Malformed { } are treated as literals. 
+ {`x{1001`, `str{x{1001}`}, + {`x{9876543210`, `str{x{9876543210}`}, + {`x{9876543210,`, `str{x{9876543210,}`}, + {`x{2,1`, `str{x{2,1}`}, + {`x{1,9876543210`, `str{x{1,9876543210}`}, + {``, `emp{}`}, + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored + {`|x|`, `alt{emp{}lit{x}emp{}}`}, + {`.`, `dot{}`}, + {`^`, `bol{}`}, + {`$`, `eol{}`}, + {`\|`, `lit{|}`}, + {`\(`, `lit{(}`}, + {`\)`, `lit{)}`}, + {`\*`, `lit{*}`}, + {`\+`, `lit{+}`}, + {`\?`, `lit{?}`}, + {`{`, `lit{{}`}, + {`}`, `lit{}}`}, + {`\.`, `lit{.}`}, + {`\^`, `lit{^}`}, + {`\$`, `lit{$}`}, + {`\\`, `lit{\}`}, + {`[ace]`, `cc{0x61 0x63 0x65}`}, + {`[abc]`, `cc{0x61-0x63}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[a]`, `lit{a}`}, + {`\-`, `lit{-}`}, + {`-`, `lit{-}`}, + {`\_`, `lit{_}`}, + {`abc`, `str{abc}`}, + {`abc|def`, `alt{str{abc}str{def}}`}, + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, + + // Posix and Perl extensions + {`[[:lower:]]`, `cc{0x61-0x7a}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`\d`, `cc{0x30-0x39}`}, + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, + // { `\C`, `byte{}` }, // probably never + + // Unicode, negatives, and a double negative. 
+ {`\p{Braille}`, `cc{0x2800-0x28ff}`}, + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, + {`\p{Any}`, `dot{}`}, + {`\p{^Any}`, `cc{}`}, + + // Hex, octal. + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, + + // More interesting regular expressions. + {`a{,2}`, `str{a{,2}}`}, + {`\.\^\$\\`, `str{.^$\}`}, + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, + + // Test precedences + {`(?:ab)*`, `star{str{ab}}`}, + {`(ab)*`, `star{cap{str{ab}}}`}, + {`ab|cd`, `alt{str{ab}str{cd}}`}, + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, + + // Test flattening. 
+ {`(?:a)`, `lit{a}`}, + {`(?:ab)(?:cd)`, `str{abcd}`}, + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, + {`a|.`, `dot{}`}, + {`.|a`, `dot{}`}, + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, + + // Test Perl quoted literals + {`\Q+|*?{[\E`, `str{+|*?{[}`}, + {`\Q+\E+`, `plus{lit{+}}`}, + {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, + {`\Q\\E`, `lit{\}`}, + {`\Q\\\E`, `str{\\}`}, + + // Test Perl \A and \z + {`(?m)^`, `bol{}`}, + {`(?m)$`, `eol{}`}, + {`(?-m)^`, `bot{}`}, + {`(?-m)$`, `eot{}`}, + {`(?m)\A`, `bot{}`}, + {`(?m)\z`, `eot{\z}`}, + {`(?-m)\A`, `bot{}`}, + {`(?-m)\z`, `eot{\z}`}, + + // Test named captures + {`(?P<name>a)`, `cap{name:lit{a}}`}, + {`(?<name>a)`, `cap{name:lit{a}}`}, + + // Case-folded literals + {`[Aa]`, `litfold{A}`}, + {`[\x{100}\x{101}]`, `litfold{Ā}`}, + {`[Δδ]`, `litfold{Δ}`}, + + // Strings + {`abcde`, `str{abcde}`}, + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, + + // Factoring. + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, + + // Bug fixes. 
+ {`(?:.)`, `dot{}`}, + {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, + {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, + {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, + {`(?:A|a)`, `litfold{A}`}, + {`A|(?:A|a)`, `litfold{A}`}, + {`(?s).`, `dot{}`}, + {`(?-s).`, `dnl{}`}, + {`(?:(?:^).)`, `cat{bol{}dot{}}`}, + {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, + {`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`}, + + // RE2 prefix_tests + {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, + {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, + {`abc|abd|aef|bcx|bcy`, + `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + + `cat{str{bc}cc{0x78-0x79}}}`}, + {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, + {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, + {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, + {`x{2}|x{2}[0-9]`, + `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, + {`x{2}y|x{2}[0-9]y`, + `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, + {`a.*?c|a.*?b`, + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, + + // Valid repetitions. + {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, + {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, + + // Valid nesting. + {strings.Repeat("(", 999) + strings.Repeat(")", 999), ``}, + {strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``}, + {"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all +} + +const testFlags = MatchNL | PerlX | UnicodeGroups + +// dump prints a string representation of the regexp showing +// the structure explicitly. 
+func dump(re *Regexp) string { + var b strings.Builder + dumpRegexp(&b, re) + return b.String() +} + +var opNames = []string{ + OpNoMatch: "no", + OpEmptyMatch: "emp", + OpLiteral: "lit", + OpCharClass: "cc", + OpAnyCharNotNL: "dnl", + OpAnyChar: "dot", + OpBeginLine: "bol", + OpEndLine: "eol", + OpBeginText: "bot", + OpEndText: "eot", + OpWordBoundary: "wb", + OpNoWordBoundary: "nwb", + OpCapture: "cap", + OpStar: "star", + OpPlus: "plus", + OpQuest: "que", + OpRepeat: "rep", + OpConcat: "cat", + OpAlternate: "alt", +} + +// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. +// It is used during testing to distinguish between parses that might print +// the same using re's String method. +func dumpRegexp(b *strings.Builder, re *Regexp) { + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { + fmt.Fprintf(b, "op%d", re.Op) + } else { + switch re.Op { + default: + b.WriteString(opNames[re.Op]) + case OpStar, OpPlus, OpQuest, OpRepeat: + if re.Flags&NonGreedy != 0 { + b.WriteByte('n') + } + b.WriteString(opNames[re.Op]) + case OpLiteral: + if len(re.Rune) > 1 { + b.WriteString("str") + } else { + b.WriteString("lit") + } + if re.Flags&FoldCase != 0 { + for _, r := range re.Rune { + if unicode.SimpleFold(r) != r { + b.WriteString("fold") + break + } + } + } + } + } + b.WriteByte('{') + switch re.Op { + case OpEndText: + if re.Flags&WasDollar == 0 { + b.WriteString(`\z`) + } + case OpLiteral: + for _, r := range re.Rune { + b.WriteRune(r) + } + case OpConcat, OpAlternate: + for _, sub := range re.Sub { + dumpRegexp(b, sub) + } + case OpStar, OpPlus, OpQuest: + dumpRegexp(b, re.Sub[0]) + case OpRepeat: + fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) + dumpRegexp(b, re.Sub[0]) + case OpCapture: + if re.Name != "" { + b.WriteString(re.Name) + b.WriteByte(':') + } + dumpRegexp(b, re.Sub[0]) + case OpCharClass: + sep := "" + for i := 0; i < len(re.Rune); i += 2 { + b.WriteString(sep) + sep = " " + lo, hi := re.Rune[i], re.Rune[i+1] + if lo == hi { + 
fmt.Fprintf(b, "%#x", lo) + } else { + fmt.Fprintf(b, "%#x-%#x", lo, hi) + } + } + } + b.WriteByte('}') +} + +func mkCharClass(f func(rune) bool) string { + re := &Regexp{Op: OpCharClass} + lo := rune(-1) + for i := rune(0); i <= unicode.MaxRune; i++ { + if f(i) { + if lo < 0 { + lo = i + } + } else { + if lo >= 0 { + re.Rune = append(re.Rune, lo, i-1) + lo = -1 + } + } + } + if lo >= 0 { + re.Rune = append(re.Rune, lo, unicode.MaxRune) + } + return dump(re) +} + +func isUpperFold(r rune) bool { + if unicode.IsUpper(r) { + return true + } + c := unicode.SimpleFold(r) + for c != r { + if unicode.IsUpper(c) { + return true + } + c = unicode.SimpleFold(c) + } + return false +} + +func TestToStringEquivalentParse(t *testing.T) { + for _, tt := range parseTests { + re, err := Parse(tt.Regexp, testFlags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + if tt.Dump == "" { + // It parsed. That's all we care about. + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + continue + } + + s := re.String() + if s != tt.Regexp { + // If ToString didn't return the original regexp, + // it must have found one with fewer parens. + // Unfortunately we can't check the length here, because + // ToString produces "\\{" for a literal brace, + // but "{" is a shorter equivalent in some contexts. + nre, err := Parse(s, testFlags) + if err != nil { + t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) + continue + } + nd := dump(nre) + if d != nd { + t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) + } + + ns := nre.String() + if s != ns { + t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) + } + } + } +} diff --git a/internal/syntaxutil/regexp.go b/internal/syntaxutil/regexp.go new file mode 100644 index 000000000..cbbe616c6 --- /dev/null +++ b/internal/syntaxutil/regexp.go @@ -0,0 +1,192 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntaxutil + +import ( + "regexp/syntax" + "strconv" + "strings" + "unicode" +) + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +// writeRegexp writes the Perl syntax for the regular expression re to b. +func writeRegexp(b *strings.Builder, re *syntax.Regexp) { + switch re.Op { + default: + b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") + case syntax.OpNoMatch: + b.WriteString(`[^\x00-\x{10FFFF}]`) + case syntax.OpEmptyMatch: + b.WriteString(`(?:)`) + case syntax.OpLiteral: + if re.Flags&syntax.FoldCase != 0 { + b.WriteString(`(?i:`) + } + for _, r := range re.Rune { + escape(b, r, false) + } + if re.Flags&syntax.FoldCase != 0 { + b.WriteString(`)`) + } + case syntax.OpCharClass: + if len(re.Rune)%2 != 0 { + b.WriteString(`[invalid char class]`) + break + } + b.WriteRune('[') + if len(re.Rune) == 0 { + b.WriteString(`^\x00-\x{10FFFF}`) + } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { + // Contains 0 and MaxRune. Probably a negated class. + // Print the gaps. 
+ b.WriteRune('^') + for i := 1; i < len(re.Rune)-1; i += 2 { + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } else { + for i := 0; i < len(re.Rune); i += 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } + b.WriteRune(']') + case syntax.OpAnyCharNotNL: + b.WriteString(`(?-s:.)`) + case syntax.OpAnyChar: + b.WriteString(`(?s:.)`) + case syntax.OpBeginLine: + b.WriteString(`(?m:^)`) + case syntax.OpEndLine: + b.WriteString(`(?m:$)`) + case syntax.OpBeginText: + b.WriteString(`\A`) + case syntax.OpEndText: + if re.Flags&syntax.WasDollar != 0 { + b.WriteString(`(?-m:$)`) + } else { + b.WriteString(`\z`) + } + case syntax.OpWordBoundary: + b.WriteString(`\b`) + case syntax.OpNoWordBoundary: + b.WriteString(`\B`) + case syntax.OpCapture: + if re.Name != "" { + b.WriteString(`(?P<`) + b.WriteString(re.Name) + b.WriteRune('>') + } else { + b.WriteRune('(') + } + if re.Sub[0].Op != syntax.OpEmptyMatch { + writeRegexp(b, re.Sub[0]) + } + b.WriteRune(')') + case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat: + if sub := re.Sub[0]; sub.Op > syntax.OpCapture || sub.Op == syntax.OpLiteral && len(sub.Rune) > 1 { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + switch re.Op { + case syntax.OpStar: + b.WriteRune('*') + case syntax.OpPlus: + b.WriteRune('+') + case syntax.OpQuest: + b.WriteRune('?') + case syntax.OpRepeat: + b.WriteRune('{') + b.WriteString(strconv.Itoa(re.Min)) + if re.Max != re.Min { + b.WriteRune(',') + if re.Max >= 0 { + b.WriteString(strconv.Itoa(re.Max)) + } + } + b.WriteRune('}') + } + if re.Flags&syntax.NonGreedy != 0 { + b.WriteRune('?') + } + case syntax.OpConcat: + for _, sub := range re.Sub { + if sub.Op == syntax.OpAlternate { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else 
{ + writeRegexp(b, sub) + } + } + case syntax.OpAlternate: + for i, sub := range re.Sub { + if i > 0 { + b.WriteRune('|') + } + writeRegexp(b, sub) + } + } +} + +func RegexpString(re *syntax.Regexp) string { + var b strings.Builder + writeRegexp(&b, re) + return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *strings.Builder, r rune, force bool) { + if unicode.IsPrint(r) { + if strings.ContainsRune(meta, r) || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.FormatInt(int64(r), 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } + b.WriteString(`\x{`) + b.WriteString(strconv.FormatInt(int64(r), 16)) + b.WriteString(`}`) + } +} diff --git a/matchtree.go b/matchtree.go index bd0f29b53..6706ecf0f 100644 --- a/matchtree.go +++ b/matchtree.go @@ -24,6 +24,7 @@ import ( "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt/internal/syntaxutil" "github.com/sourcegraph/zoekt/query" ) @@ -204,7 +205,7 @@ func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { } return &regexpMatchTree{ - regexp: regexp.MustCompile(prefix + s.Regexp.String()), + regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), origRegexp: s.Regexp, fileName: s.FileName, } diff --git a/matchtree_test.go b/matchtree_test.go index 0f56f7623..1fece0f5b 100644 --- a/matchtree_test.go +++ b/matchtree_test.go @@ -238,9 +238,9 @@ func TestSymbolMatchTree(t *testing.T) { regex string regexAll bool }{ - {query: "sym:.*", regex: "(?i)(?-s:.*)", regexAll: true}, + {query: "sym:.*", regex: "(?i)(?-s:.)*", regexAll: true}, {query: "sym:(ab|cd)", regex: "(?i)ab|cd"}, - {query: "sym:b.r", regex: "(?i)(?-s:b.r)"}, + {query: "sym:b.r", regex: 
"(?i)b(?-s:.)r"}, {query: "sym:horse", substr: "horse"}, {query: `sym:\bthread\b case:yes`, regex: `\bthread\b`}, // check we disable word search opt {query: `sym:\bthread\b case:no`, regex: `(?i)\bthread\b`}, diff --git a/query/query.go b/query/query.go index 479a0bb46..d306bce06 100644 --- a/query/query.go +++ b/query/query.go @@ -29,6 +29,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt/internal/syntaxutil" ) var _ = log.Println @@ -99,7 +100,7 @@ func (q *Regexp) String() string { if q.CaseSensitive { pref = "case_" + pref } - return fmt.Sprintf("%sregex:%q", pref, q.Regexp.String()) + return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp)) } // gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which @@ -112,7 +113,7 @@ type gobRegexp struct { // GobEncode implements gob.Encoder. func (q Regexp) GobEncode() ([]byte, error) { - gobq := gobRegexp{Regexp: q, RegexpString: q.Regexp.String()} + gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)} gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded return json.Marshal(gobq) } @@ -457,7 +458,7 @@ func (q *Regexp) setCase(k string) { case "no": q.CaseSensitive = false case "auto": - q.CaseSensitive = (q.Regexp.String() != LowerRegexp(q.Regexp).String()) + q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp)) } } diff --git a/query/regexp.go b/query/regexp.go index 21d7e8a31..889842fdd 100644 --- a/query/regexp.go +++ b/query/regexp.go @@ -17,6 +17,8 @@ package query import ( "log" "regexp/syntax" + + "github.com/sourcegraph/zoekt/internal/syntaxutil" ) var _ = log.Println @@ -56,7 +58,7 @@ func convertCapture(re *syntax.Regexp, flags syntax.Flags) *syntax.Regexp { } // Make a copy so in unlikely event of an error the original can be used as a fallback - r, err := syntax.Parse(re.String(), flags) + r, err := syntax.Parse(syntaxutil.RegexpString(re), flags) if err != nil { 
log.Printf("failed to copy regexp `%s`: %v", re, err) return re @@ -65,7 +67,7 @@ func convertCapture(re *syntax.Regexp, flags syntax.Flags) *syntax.Regexp { r = uncapture(r) // Parse again for new structure to take effect - r, err = syntax.Parse(r.String(), flags) + r, err = syntax.Parse(syntaxutil.RegexpString(r), flags) if err != nil { log.Printf("failed to parse regexp after uncapture `%s`: %v", r, err) return re diff --git a/query/regexp_test.go b/query/regexp_test.go index 4bfe3747a..27a9dbab4 100644 --- a/query/regexp_test.go +++ b/query/regexp_test.go @@ -18,6 +18,8 @@ import ( "regexp/syntax" "strings" "testing" + + "github.com/sourcegraph/zoekt/internal/syntaxutil" ) var opnames = map[syntax.Op]string{ @@ -52,7 +54,7 @@ func printRegexp(t *testing.T, r *syntax.Regexp, lvl int) { func TestLowerRegexp(t *testing.T) { in := "[a-zA-Z]fooBAR" re := mustParseRE(in) - in = re.String() + in = syntaxutil.RegexpString(re) got := LowerRegexp(re) want := "[a-za-z]foobar" if got.String() != want { @@ -61,8 +63,8 @@ func TestLowerRegexp(t *testing.T) { t.Errorf("got %s, want %s", got, want) } - if re.String() != in { - t.Errorf("got mutated original %s want %s", re.String(), in) + if orig := syntaxutil.RegexpString(re); orig != in { + t.Errorf("got mutated original %s want %s", orig, in) } }