diff --git a/internal/syntaxutil/README.md b/internal/syntaxutil/README.md new file mode 100644 index 000000000..2b551db32 --- /dev/null +++ b/internal/syntaxutil/README.md @@ -0,0 +1,58 @@ +# vendored std regexp/syntax + +This package contains a vendored copy of std regexp/syntax. However, it only +contains the code for converting syntax.Regexp into a String. It is the +version of the code at a recent go commit, but with a commit which introduces +a significant performance regression reverted. + +At the time of writing regexp.String on go1.22 is taking 40% of CPU at +Sourcegraph. This should return to ~0% with this vendored code. + +https://github.com/sourcegraph/sourcegraph/issues/61462 + +## Vendored commit + +``` +commit 2e1003e2f7e42efc5771812b9ee6ed264803796c +Author: Daniel Martí +Date: Tue Mar 26 22:59:41 2024 +0200 + + cmd/go: replace reflect.DeepEqual with slices.Equal and maps.Equal + + All of these maps and slices are made up of comparable types, + so we can avoid the overhead of reflection entirely. + + Change-Id: If77dbe648a336ba729c171e84c9ff3f7e160297d + Reviewed-on: https://go-review.googlesource.com/c/go/+/574597 + Reviewed-by: Than McIntosh + LUCI-TryBot-Result: Go LUCI + Reviewed-by: Ian Lance Taylor +``` + +## Reverted commit + +``` +commit 98c9f271d67b501ecf2ce995539abd2cdc81d505 +Author: Russ Cox +Date: Wed Jun 28 17:45:26 2023 -0400 + + regexp/syntax: use more compact Regexp.String output + + Compact the Regexp.String output. It was only ever intended for debugging, + but there are at least some uses in the wild where regexps are built up + using regexp/syntax and then formatted using the String method. + Compact the output to help that use case. Specifically: + + - Compact 2-element character class ranges: [a-b] -> [ab]. + - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E). + + Fixes #57950. 
+ + Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb + Reviewed-on: https://go-review.googlesource.com/c/go/+/507015 + TryBot-Result: Gopher Robot + Reviewed-by: Than McIntosh + Run-TryBot: Russ Cox + Reviewed-by: Rob Pike + Auto-Submit: Russ Cox +``` diff --git a/internal/syntaxutil/alias_test.go b/internal/syntaxutil/alias_test.go new file mode 100644 index 000000000..c2a5d6a08 --- /dev/null +++ b/internal/syntaxutil/alias_test.go @@ -0,0 +1,51 @@ +package syntaxutil + +import "regexp/syntax" + +// A bunch of aliases to avoid needing to modify parse_test.go too much. + +type Regexp = syntax.Regexp + +type Op = syntax.Op + +const ( + OpNoMatch = syntax.OpNoMatch + OpEmptyMatch = syntax.OpEmptyMatch + OpLiteral = syntax.OpLiteral + OpCharClass = syntax.OpCharClass + OpAnyCharNotNL = syntax.OpAnyCharNotNL + OpAnyChar = syntax.OpAnyChar + OpBeginLine = syntax.OpBeginLine + OpEndLine = syntax.OpEndLine + OpBeginText = syntax.OpBeginText + OpEndText = syntax.OpEndText + OpWordBoundary = syntax.OpWordBoundary + OpNoWordBoundary = syntax.OpNoWordBoundary + OpCapture = syntax.OpCapture + OpStar = syntax.OpStar + OpPlus = syntax.OpPlus + OpQuest = syntax.OpQuest + OpRepeat = syntax.OpRepeat + OpConcat = syntax.OpConcat + OpAlternate = syntax.OpAlternate +) + +type Flags = syntax.Flags + +const ( + FoldCase = syntax.FoldCase + Literal = syntax.Literal + ClassNL = syntax.ClassNL + DotNL = syntax.DotNL + OneLine = syntax.OneLine + NonGreedy = syntax.NonGreedy + PerlX = syntax.PerlX + UnicodeGroups = syntax.UnicodeGroups + WasDollar = syntax.WasDollar + Simple = syntax.Simple + MatchNL = syntax.MatchNL + Perl = syntax.Perl + POSIX = syntax.POSIX +) + +var Parse = syntax.Parse diff --git a/internal/syntaxutil/parse_test.go b/internal/syntaxutil/parse_test.go new file mode 100644 index 000000000..23b8a916f --- /dev/null +++ b/internal/syntaxutil/parse_test.go @@ -0,0 +1,397 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntaxutil + +import ( + "fmt" + "strings" + "testing" + "unicode" +) + +type parseTest struct { + Regexp string + Dump string +} + +var parseTests = []parseTest{ + // Base cases + {`a`, `lit{a}`}, + {`a.`, `cat{lit{a}dot{}}`}, + {`a.b`, `cat{lit{a}dot{}lit{b}}`}, + {`ab`, `str{ab}`}, + {`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`}, + {`abc`, `str{abc}`}, + {`a|^`, `alt{lit{a}bol{}}`}, + {`a|b`, `cc{0x61-0x62}`}, + {`(a)`, `cap{lit{a}}`}, + {`(a)|b`, `alt{cap{lit{a}}lit{b}}`}, + {`a*`, `star{lit{a}}`}, + {`a+`, `plus{lit{a}}`}, + {`a?`, `que{lit{a}}`}, + {`a{2}`, `rep{2,2 lit{a}}`}, + {`a{2,3}`, `rep{2,3 lit{a}}`}, + {`a{2,}`, `rep{2,-1 lit{a}}`}, + {`a*?`, `nstar{lit{a}}`}, + {`a+?`, `nplus{lit{a}}`}, + {`a??`, `nque{lit{a}}`}, + {`a{2}?`, `nrep{2,2 lit{a}}`}, + {`a{2,3}?`, `nrep{2,3 lit{a}}`}, + {`a{2,}?`, `nrep{2,-1 lit{a}}`}, + // Malformed { } are treated as literals. 
+ {`x{1001`, `str{x{1001}`}, + {`x{9876543210`, `str{x{9876543210}`}, + {`x{9876543210,`, `str{x{9876543210,}`}, + {`x{2,1`, `str{x{2,1}`}, + {`x{1,9876543210`, `str{x{1,9876543210}`}, + {``, `emp{}`}, + {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored + {`|x|`, `alt{emp{}lit{x}emp{}}`}, + {`.`, `dot{}`}, + {`^`, `bol{}`}, + {`$`, `eol{}`}, + {`\|`, `lit{|}`}, + {`\(`, `lit{(}`}, + {`\)`, `lit{)}`}, + {`\*`, `lit{*}`}, + {`\+`, `lit{+}`}, + {`\?`, `lit{?}`}, + {`{`, `lit{{}`}, + {`}`, `lit{}}`}, + {`\.`, `lit{.}`}, + {`\^`, `lit{^}`}, + {`\$`, `lit{$}`}, + {`\\`, `lit{\}`}, + {`[ace]`, `cc{0x61 0x63 0x65}`}, + {`[abc]`, `cc{0x61-0x63}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[a]`, `lit{a}`}, + {`\-`, `lit{-}`}, + {`-`, `lit{-}`}, + {`\_`, `lit{_}`}, + {`abc`, `str{abc}`}, + {`abc|def`, `alt{str{abc}str{def}}`}, + {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`}, + + // Posix and Perl extensions + {`[[:lower:]]`, `cc{0x61-0x7a}`}, + {`[a-z]`, `cc{0x61-0x7a}`}, + {`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`}, + {`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`\d`, `cc{0x30-0x39}`}, + {`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`}, + {`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`}, + {`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`}, + {`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`}, + {`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`}, + {`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`}, + {`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`}, + {`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`}, + // { `\C`, `byte{}` }, // probably never + + // Unicode, negatives, and a double negative. 
+ {`\p{Braille}`, `cc{0x2800-0x28ff}`}, + {`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`\P{^Braille}`, `cc{0x2800-0x28ff}`}, + {`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`[\p{Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`}, + {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`}, + {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`}, + {`\p{Lu}`, mkCharClass(unicode.IsUpper)}, + {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)}, + {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)}, + {`\p{Any}`, `dot{}`}, + {`\p{^Any}`, `cc{}`}, + + // Hex, octal. + {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`}, + {`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`}, + + // More interesting regular expressions. + {`a{,2}`, `str{a{,2}}`}, + {`\.\^\$\\`, `str{.^$\}`}, + {`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`}, + {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`}, + {`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8 + {`a*{`, `cat{star{lit{a}}lit{{}}`}, + + // Test precedences + {`(?:ab)*`, `star{str{ab}}`}, + {`(ab)*`, `star{cap{str{ab}}}`}, + {`ab|cd`, `alt{str{ab}str{cd}}`}, + {`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`}, + + // Test flattening. 
+ {`(?:a)`, `lit{a}`}, + {`(?:ab)(?:cd)`, `str{abcd}`}, + {`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`}, + {`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`}, + {`a|.`, `dot{}`}, + {`.|a`, `dot{}`}, + {`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`}, + {`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`}, + + // Test Perl quoted literals + {`\Q+|*?{[\E`, `str{+|*?{[}`}, + {`\Q+\E+`, `plus{lit{+}}`}, + {`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`}, + {`\Q\\E`, `lit{\}`}, + {`\Q\\\E`, `str{\\}`}, + + // Test Perl \A and \z + {`(?m)^`, `bol{}`}, + {`(?m)$`, `eol{}`}, + {`(?-m)^`, `bot{}`}, + {`(?-m)$`, `eot{}`}, + {`(?m)\A`, `bot{}`}, + {`(?m)\z`, `eot{\z}`}, + {`(?-m)\A`, `bot{}`}, + {`(?-m)\z`, `eot{\z}`}, + + // Test named captures + {`(?Pa)`, `cap{name:lit{a}}`}, + {`(?a)`, `cap{name:lit{a}}`}, + + // Case-folded literals + {`[Aa]`, `litfold{A}`}, + {`[\x{100}\x{101}]`, `litfold{Ā}`}, + {`[Δδ]`, `litfold{Δ}`}, + + // Strings + {`abcde`, `str{abcde}`}, + {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`}, + + // Factoring. + {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, + + // Bug fixes. 
+ {`(?:.)`, `dot{}`}, + {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, + {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, + {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, + {`(?:A|a)`, `litfold{A}`}, + {`A|(?:A|a)`, `litfold{A}`}, + {`(?s).`, `dot{}`}, + {`(?-s).`, `dnl{}`}, + {`(?:(?:^).)`, `cat{bol{}dot{}}`}, + {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, + {`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`}, + + // RE2 prefix_tests + {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`}, + {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`}, + {`abc|abd|aef|bcx|bcy`, + `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` + + `cat{str{bc}cc{0x78-0x79}}}`}, + {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, + {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, + {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, + {`x{2}|x{2}[0-9]`, + `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, + {`x{2}y|x{2}[0-9]y`, + `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, + {`a.*?c|a.*?b`, + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, + + // Valid repetitions. + {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, + {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, + + // Valid nesting. + {strings.Repeat("(", 999) + strings.Repeat(")", 999), ``}, + {strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``}, + {"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all +} + +const testFlags = MatchNL | PerlX | UnicodeGroups + +// dump prints a string representation of the regexp showing +// the structure explicitly. 
+func dump(re *Regexp) string { + var b strings.Builder + dumpRegexp(&b, re) + return b.String() +} + +var opNames = []string{ + OpNoMatch: "no", + OpEmptyMatch: "emp", + OpLiteral: "lit", + OpCharClass: "cc", + OpAnyCharNotNL: "dnl", + OpAnyChar: "dot", + OpBeginLine: "bol", + OpEndLine: "eol", + OpBeginText: "bot", + OpEndText: "eot", + OpWordBoundary: "wb", + OpNoWordBoundary: "nwb", + OpCapture: "cap", + OpStar: "star", + OpPlus: "plus", + OpQuest: "que", + OpRepeat: "rep", + OpConcat: "cat", + OpAlternate: "alt", +} + +// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. +// It is used during testing to distinguish between parses that might print +// the same using re's String method. +func dumpRegexp(b *strings.Builder, re *Regexp) { + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { + fmt.Fprintf(b, "op%d", re.Op) + } else { + switch re.Op { + default: + b.WriteString(opNames[re.Op]) + case OpStar, OpPlus, OpQuest, OpRepeat: + if re.Flags&NonGreedy != 0 { + b.WriteByte('n') + } + b.WriteString(opNames[re.Op]) + case OpLiteral: + if len(re.Rune) > 1 { + b.WriteString("str") + } else { + b.WriteString("lit") + } + if re.Flags&FoldCase != 0 { + for _, r := range re.Rune { + if unicode.SimpleFold(r) != r { + b.WriteString("fold") + break + } + } + } + } + } + b.WriteByte('{') + switch re.Op { + case OpEndText: + if re.Flags&WasDollar == 0 { + b.WriteString(`\z`) + } + case OpLiteral: + for _, r := range re.Rune { + b.WriteRune(r) + } + case OpConcat, OpAlternate: + for _, sub := range re.Sub { + dumpRegexp(b, sub) + } + case OpStar, OpPlus, OpQuest: + dumpRegexp(b, re.Sub[0]) + case OpRepeat: + fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) + dumpRegexp(b, re.Sub[0]) + case OpCapture: + if re.Name != "" { + b.WriteString(re.Name) + b.WriteByte(':') + } + dumpRegexp(b, re.Sub[0]) + case OpCharClass: + sep := "" + for i := 0; i < len(re.Rune); i += 2 { + b.WriteString(sep) + sep = " " + lo, hi := re.Rune[i], re.Rune[i+1] + if lo == hi { + 
fmt.Fprintf(b, "%#x", lo) + } else { + fmt.Fprintf(b, "%#x-%#x", lo, hi) + } + } + } + b.WriteByte('}') +} + +func mkCharClass(f func(rune) bool) string { + re := &Regexp{Op: OpCharClass} + lo := rune(-1) + for i := rune(0); i <= unicode.MaxRune; i++ { + if f(i) { + if lo < 0 { + lo = i + } + } else { + if lo >= 0 { + re.Rune = append(re.Rune, lo, i-1) + lo = -1 + } + } + } + if lo >= 0 { + re.Rune = append(re.Rune, lo, unicode.MaxRune) + } + return dump(re) +} + +func isUpperFold(r rune) bool { + if unicode.IsUpper(r) { + return true + } + c := unicode.SimpleFold(r) + for c != r { + if unicode.IsUpper(c) { + return true + } + c = unicode.SimpleFold(c) + } + return false +} + +func TestToStringEquivalentParse(t *testing.T) { + for _, tt := range parseTests { + re, err := Parse(tt.Regexp, testFlags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + if tt.Dump == "" { + // It parsed. That's all we care about. + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + continue + } + + s := re.String() + if s != tt.Regexp { + // If ToString didn't return the original regexp, + // it must have found one with fewer parens. + // Unfortunately we can't check the length here, because + // ToString produces "\\{" for a literal brace, + // but "{" is a shorter equivalent in some contexts. + nre, err := Parse(s, testFlags) + if err != nil { + t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err) + continue + } + nd := dump(nre) + if d != nd { + t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd) + } + + ns := nre.String() + if s != ns { + t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns) + } + } + } +} diff --git a/internal/syntaxutil/regexp.go b/internal/syntaxutil/regexp.go new file mode 100644 index 000000000..cbbe616c6 --- /dev/null +++ b/internal/syntaxutil/regexp.go @@ -0,0 +1,192 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.

// writeRegexp writes the Perl syntax for the regular expression re to b.
//
// The output is deliberately verbose: flags are spelled out per node
// (e.g. `(?-s:.)`, `(?i:...)`) rather than aggregated.
func writeRegexp(b *strings.Builder, re *syntax.Regexp) {
	switch re.Op {
	default:
		// Unknown operator: emit a visible marker instead of nothing so
		// the problem shows up in the printed expression.
		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
	case syntax.OpNoMatch:
		b.WriteString(`[^\x00-\x{10FFFF}]`)
	case syntax.OpEmptyMatch:
		b.WriteString(`(?:)`)
	case syntax.OpLiteral:
		if re.Flags&syntax.FoldCase != 0 {
			b.WriteString(`(?i:`)
		}
		for _, r := range re.Rune {
			escape(b, r, false)
		}
		if re.Flags&syntax.FoldCase != 0 {
			b.WriteString(`)`)
		}
	case syntax.OpCharClass:
		// re.Rune holds lo,hi pairs, so an odd length is malformed.
		if len(re.Rune)%2 != 0 {
			b.WriteString(`[invalid char class]`)
			break
		}
		b.WriteRune('[')
		if len(re.Rune) == 0 {
			b.WriteString(`^\x00-\x{10FFFF}`)
		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
			// Contains 0 and MaxRune. Probably a negated class.
			// Print the gaps.
			b.WriteRune('^')
			for i := 1; i < len(re.Rune)-1; i += 2 {
				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
				escape(b, lo, lo == '-')
				if lo != hi {
					b.WriteRune('-')
					escape(b, hi, hi == '-')
				}
			}
		} else {
			for i := 0; i < len(re.Rune); i += 2 {
				lo, hi := re.Rune[i], re.Rune[i+1]
				escape(b, lo, lo == '-')
				if lo != hi {
					b.WriteRune('-')
					escape(b, hi, hi == '-')
				}
			}
		}
		b.WriteRune(']')
	case syntax.OpAnyCharNotNL:
		b.WriteString(`(?-s:.)`)
	case syntax.OpAnyChar:
		b.WriteString(`(?s:.)`)
	case syntax.OpBeginLine:
		b.WriteString(`(?m:^)`)
	case syntax.OpEndLine:
		b.WriteString(`(?m:$)`)
	case syntax.OpBeginText:
		b.WriteString(`\A`)
	case syntax.OpEndText:
		if re.Flags&syntax.WasDollar != 0 {
			b.WriteString(`(?-m:$)`)
		} else {
			b.WriteString(`\z`)
		}
	case syntax.OpWordBoundary:
		b.WriteString(`\b`)
	case syntax.OpNoWordBoundary:
		b.WriteString(`\B`)
	case syntax.OpCapture:
		if re.Name != "" {
			b.WriteString(`(?P<`)
			b.WriteString(re.Name)
			b.WriteRune('>')
		} else {
			b.WriteRune('(')
		}
		if re.Sub[0].Op != syntax.OpEmptyMatch {
			writeRegexp(b, re.Sub[0])
		}
		b.WriteRune(')')
	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
		// Wrap the operand in a non-capturing group when the repetition
		// operator would otherwise bind to only part of it.
		if sub := re.Sub[0]; sub.Op > syntax.OpCapture || sub.Op == syntax.OpLiteral && len(sub.Rune) > 1 {
			b.WriteString(`(?:`)
			writeRegexp(b, sub)
			b.WriteString(`)`)
		} else {
			writeRegexp(b, sub)
		}
		switch re.Op {
		case syntax.OpStar:
			b.WriteRune('*')
		case syntax.OpPlus:
			b.WriteRune('+')
		case syntax.OpQuest:
			b.WriteRune('?')
		case syntax.OpRepeat:
			b.WriteRune('{')
			b.WriteString(strconv.Itoa(re.Min))
			if re.Max != re.Min {
				b.WriteRune(',')
				// A negative Max means "no upper bound": print {min,}.
				if re.Max >= 0 {
					b.WriteString(strconv.Itoa(re.Max))
				}
			}
			b.WriteRune('}')
		}
		if re.Flags&syntax.NonGreedy != 0 {
			b.WriteRune('?')
		}
	case syntax.OpConcat:
		for _, sub := range re.Sub {
			// Alternation binds looser than concatenation, so group it.
			if sub.Op == syntax.OpAlternate {
				b.WriteString(`(?:`)
				writeRegexp(b, sub)
				b.WriteString(`)`)
			} else {
				writeRegexp(b, sub)
			}
		}
	case syntax.OpAlternate:
		for i, sub := range re.Sub {
			if i > 0 {
				b.WriteRune('|')
			}
			writeRegexp(b, sub)
		}
	}
}

// RegexpString returns the Perl-syntax source text for re as produced by
// writeRegexp. It is this package's replacement for calling
// (*syntax.Regexp).String directly. re must not be nil.
func RegexpString(re *syntax.Regexp) string {
	var b strings.Builder
	writeRegexp(&b, re)
	return b.String()
}

// meta lists the runes that escape always prefixes with a backslash.
const meta = `\.+*?()|[]{}^$`

// escape writes the rune r to b in regexp source form.
//
// Printable runes are written as-is, preceded by a backslash when r is in
// meta or force is true. Non-printable runes are written as a C-style
// escape (\a \f \n \r \t \v) when one exists, otherwise as \xNN for runes
// below 0x100 and \x{N...} for everything else.
func escape(b *strings.Builder, r rune, force bool) {
	if unicode.IsPrint(r) {
		if strings.ContainsRune(meta, r) || force {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	switch r {
	case '\a':
		b.WriteString(`\a`)
	case '\f':
		b.WriteString(`\f`)
	case '\n':
		b.WriteString(`\n`)
	case '\r':
		b.WriteString(`\r`)
	case '\t':
		b.WriteString(`\t`)
	case '\v':
		b.WriteString(`\v`)
	default:
		if r < 0x100 {
			b.WriteString(`\x`)
			s := strconv.FormatInt(int64(r), 16)
			// Always emit two hex digits for the short form.
			if len(s) == 1 {
				b.WriteRune('0')
			}
			b.WriteString(s)
			break
		}
		b.WriteString(`\x{`)
		b.WriteString(strconv.FormatInt(int64(r), 16))
		b.WriteString(`}`)
	}
}
"(?i)b(?-s:.)r"}, {query: "sym:horse", substr: "horse"}, {query: `sym:\bthread\b case:yes`, regex: `\bthread\b`}, // check we disable word search opt {query: `sym:\bthread\b case:no`, regex: `(?i)\bthread\b`}, diff --git a/query/query.go b/query/query.go index 479a0bb46..d306bce06 100644 --- a/query/query.go +++ b/query/query.go @@ -29,6 +29,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt/internal/syntaxutil" ) var _ = log.Println @@ -99,7 +100,7 @@ func (q *Regexp) String() string { if q.CaseSensitive { pref = "case_" + pref } - return fmt.Sprintf("%sregex:%q", pref, q.Regexp.String()) + return fmt.Sprintf("%sregex:%q", pref, syntaxutil.RegexpString(q.Regexp)) } // gobRegexp wraps Regexp to make it gob-encodable/decodable. Regexp contains syntax.Regexp, which @@ -112,7 +113,7 @@ type gobRegexp struct { // GobEncode implements gob.Encoder. func (q Regexp) GobEncode() ([]byte, error) { - gobq := gobRegexp{Regexp: q, RegexpString: q.Regexp.String()} + gobq := gobRegexp{Regexp: q, RegexpString: syntaxutil.RegexpString(q.Regexp)} gobq.Regexp.Regexp = nil // can't be gob-encoded/decoded return json.Marshal(gobq) } @@ -457,7 +458,7 @@ func (q *Regexp) setCase(k string) { case "no": q.CaseSensitive = false case "auto": - q.CaseSensitive = (q.Regexp.String() != LowerRegexp(q.Regexp).String()) + q.CaseSensitive = !q.Regexp.Equal(LowerRegexp(q.Regexp)) } } diff --git a/query/regexp.go b/query/regexp.go index 21d7e8a31..889842fdd 100644 --- a/query/regexp.go +++ b/query/regexp.go @@ -17,6 +17,8 @@ package query import ( "log" "regexp/syntax" + + "github.com/sourcegraph/zoekt/internal/syntaxutil" ) var _ = log.Println @@ -56,7 +58,7 @@ func convertCapture(re *syntax.Regexp, flags syntax.Flags) *syntax.Regexp { } // Make a copy so in unlikely event of an error the original can be used as a fallback - r, err := syntax.Parse(re.String(), flags) + r, err := syntax.Parse(syntaxutil.RegexpString(re), flags) if err != nil { 
log.Printf("failed to copy regexp `%s`: %v", re, err) return re @@ -65,7 +67,7 @@ func convertCapture(re *syntax.Regexp, flags syntax.Flags) *syntax.Regexp { r = uncapture(r) // Parse again for new structure to take effect - r, err = syntax.Parse(r.String(), flags) + r, err = syntax.Parse(syntaxutil.RegexpString(r), flags) if err != nil { log.Printf("failed to parse regexp after uncapture `%s`: %v", r, err) return re diff --git a/query/regexp_test.go b/query/regexp_test.go index 4bfe3747a..27a9dbab4 100644 --- a/query/regexp_test.go +++ b/query/regexp_test.go @@ -18,6 +18,8 @@ import ( "regexp/syntax" "strings" "testing" + + "github.com/sourcegraph/zoekt/internal/syntaxutil" ) var opnames = map[syntax.Op]string{ @@ -52,7 +54,7 @@ func printRegexp(t *testing.T, r *syntax.Regexp, lvl int) { func TestLowerRegexp(t *testing.T) { in := "[a-zA-Z]fooBAR" re := mustParseRE(in) - in = re.String() + in = syntaxutil.RegexpString(re) got := LowerRegexp(re) want := "[a-za-z]foobar" if got.String() != want { @@ -61,8 +63,8 @@ func TestLowerRegexp(t *testing.T) { t.Errorf("got %s, want %s", got, want) } - if re.String() != in { - t.Errorf("got mutated original %s want %s", re.String(), in) + if orig := syntaxutil.RegexpString(re); orig != in { + t.Errorf("got mutated original %s want %s", orig, in) } }