From 82a8c6abe23a2afa3d7cbf3ccebc35e59d646dbb Mon Sep 17 00:00:00 2001
From: Bryan Boreham <bjboreham@gmail.com>
Date: Wed, 3 Jul 2024 18:45:36 +0100
Subject: [PATCH 1/2] [ENHANCEMENT] Optimize regexps with multiple prefixes
 (#13843)

For example `foo.*|bar.*|baz.*`. Instead of checking each alternative in
turn, we build a map keyed by a fixed-length literal prefix, then check
only the smaller set of matchers that could match the supplied string.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Improve testing and readability

Address review comments on #13843

Signed-off-by: Marco Pracucci <marco@pracucci.com>
---
 model/labels/regexp.go      | 129 +++++++++++++----
 model/labels/regexp_test.go | 271 +++++++++++++++++++++++++++++-------
 2 files changed, 319 insertions(+), 81 deletions(-)

diff --git a/model/labels/regexp.go b/model/labels/regexp.go
index 767bd6942f2..d2151d83ddb 100644
--- a/model/labels/regexp.go
+++ b/model/labels/regexp.go
@@ -28,7 +28,7 @@ const (
 	maxSetMatches = 256
 
 	// The minimum number of alternate values a regex should have to trigger
-	// the optimization done by optimizeEqualStringMatchers() and so use a map
+	// the optimization done by optimizeEqualOrPrefixStringMatchers() and so use a map
 	// to match values instead of iterating over a list. This value has
-	// been computed running BenchmarkOptimizeEqualStringMatchers.
+	// been computed running BenchmarkOptimizeEqualOrPrefixStringMatchers.
 	minEqualMultiStringMatcherMapThreshold = 16
@@ -337,7 +337,7 @@ func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
 		return nil, nil
 	}
 
-	multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates)
+	multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates, 0, 0)
 
 	for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') {
 		// Split the string into the next literal and the remainder
@@ -412,7 +412,7 @@ func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
 	clearBeginEndText(re)
 
 	m := stringMatcherFromRegexpInternal(re)
-	m = optimizeEqualStringMatchers(m, minEqualMultiStringMatcherMapThreshold)
+	m = optimizeEqualOrPrefixStringMatchers(m, minEqualMultiStringMatcherMapThreshold)
 
 	return m
 }
@@ -732,17 +732,20 @@ func (m *equalStringMatcher) Matches(s string) bool {
 type multiStringMatcherBuilder interface {
 	StringMatcher
 	add(s string)
+	addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher)
 	setMatches() []string
 }
 
-func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize int) multiStringMatcherBuilder {
+func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize, estimatedPrefixes, minPrefixLength int) multiStringMatcherBuilder {
 	// If the estimated size is low enough, it's faster to use a slice instead of a map.
-	if estimatedSize < minEqualMultiStringMatcherMapThreshold {
+	if estimatedSize < minEqualMultiStringMatcherMapThreshold && estimatedPrefixes == 0 {
 		return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)}
 	}
 
 	return &equalMultiStringMapMatcher{
 		values:        make(map[string]struct{}, estimatedSize),
+		prefixes:      make(map[string][]StringMatcher, estimatedPrefixes),
+		minPrefixLen:  minPrefixLength,
 		caseSensitive: caseSensitive,
 	}
 }
@@ -758,6 +761,10 @@ func (m *equalMultiStringSliceMatcher) add(s string) {
 	m.values = append(m.values, s)
 }
 
+func (m *equalMultiStringSliceMatcher) addPrefix(_ string, _ bool, _ StringMatcher) {
+	panic("not implemented")
+}
+
 func (m *equalMultiStringSliceMatcher) setMatches() []string {
 	return m.values
 }
@@ -779,12 +786,17 @@ func (m *equalMultiStringSliceMatcher) Matches(s string) bool {
 	return false
 }
 
-// equalMultiStringMapMatcher matches a string exactly against a map of valid values.
+// equalMultiStringMapMatcher matches a string exactly against a map of valid values
+// or against a set of prefix matchers.
 type equalMultiStringMapMatcher struct {
 	// values contains values to match a string against. If the matching is case insensitive,
 	// the values here must be lowercase.
 	values map[string]struct{}
-
+	// prefixes maps strings, all of length minPrefixLen, to the matchers to run against the whole input string.
+	// If the matching is case insensitive, prefixes are all lowercase.
+	prefixes map[string][]StringMatcher
+	// minPrefixLen can be zero, meaning there are no prefix matchers.
+	minPrefixLen  int
 	caseSensitive bool
 }
 
@@ -796,8 +808,27 @@ func (m *equalMultiStringMapMatcher) add(s string) {
 	m.values[s] = struct{}{}
 }
 
+func (m *equalMultiStringMapMatcher) addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) {
+	if m.minPrefixLen == 0 {
+		panic("addPrefix called when no prefix length defined")
+	}
+	if len(prefix) < m.minPrefixLen {
+		panic("addPrefix called with a too short prefix")
+	}
+	if m.caseSensitive != prefixCaseSensitive {
+		panic("addPrefix called with a prefix whose case sensitivity is different than the expected one")
+	}
+
+	s := prefix[:m.minPrefixLen]
+	if !m.caseSensitive {
+		s = strings.ToLower(s)
+	}
+
+	m.prefixes[s] = append(m.prefixes[s], matcher)
+}
+
 func (m *equalMultiStringMapMatcher) setMatches() []string {
-	if len(m.values) >= maxSetMatches {
+	if len(m.values) >= maxSetMatches || len(m.prefixes) > 0 {
 		return nil
 	}
 
@@ -813,8 +844,17 @@ func (m *equalMultiStringMapMatcher) Matches(s string) bool {
 		s = toNormalisedLower(s)
 	}
 
-	_, ok := m.values[s]
-	return ok
+	if _, ok := m.values[s]; ok {
+		return true
+	}
+	if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen {
+		for _, matcher := range m.prefixes[s[:m.minPrefixLen]] {
+			if matcher.Matches(s) {
+				return true
+			}
+		}
+	}
+	return false
 }
 
 // toNormalisedLower normalise the input string using "Unicode Normalization Form D" and then convert
@@ -897,20 +937,24 @@ func (m trueMatcher) Matches(_ string) bool {
 	return true
 }
 
-// optimizeEqualStringMatchers optimize a specific case where all matchers are made by an
-// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher). In
-// this specific case, when we have many strings to match against we can use a map instead
+// optimizeEqualOrPrefixStringMatchers optimizes the specific case where the input matcher is an
+// alternation (orStringMatcher) made only of strings checked for equality (equalStringMatcher) or
+// matchers with a literal prefix (literalPrefixSensitiveStringMatcher or literalPrefixInsensitiveStringMatcher).
+//
+// In this specific case, when we have many strings to match against we can use a map instead
 // of iterating over the list of strings.
-func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
+func optimizeEqualOrPrefixStringMatchers(input StringMatcher, threshold int) StringMatcher {
 	var (
 		caseSensitive    bool
 		caseSensitiveSet bool
 		numValues        int
+		numPrefixes      int
+		minPrefixLength  int
 	)
 
 	// Analyse the input StringMatcher to count the number of occurrences
 	// and ensure all of them have the same case sensitivity.
-	analyseCallback := func(matcher *equalStringMatcher) bool {
+	analyseEqualMatcherCallback := func(matcher *equalStringMatcher) bool {
 		// Ensure we don't have mixed case sensitivity.
 		if caseSensitiveSet && caseSensitive != matcher.caseSensitive {
 			return false
@@ -923,34 +967,55 @@ func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatch
 		return true
 	}
 
-	if !findEqualStringMatchers(input, analyseCallback) {
+	analysePrefixMatcherCallback := func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool {
+		// Ensure we don't have mixed case sensitivity.
+		if caseSensitiveSet && caseSensitive != prefixCaseSensitive {
+			return false
+		} else if !caseSensitiveSet {
+			caseSensitive = prefixCaseSensitive
+			caseSensitiveSet = true
+		}
+		if numPrefixes == 0 || len(prefix) < minPrefixLength {
+			minPrefixLength = len(prefix)
+		}
+
+		numPrefixes++
+		return true
+	}
+
+	if !findEqualOrPrefixStringMatchers(input, analyseEqualMatcherCallback, analysePrefixMatcherCallback) {
 		return input
 	}
 
-	// If the number of values found is less than the threshold, then we should skip the optimization.
-	if numValues < threshold {
+	// If the number of values and prefixes found is less than the threshold, then we should skip the optimization.
+	if (numValues + numPrefixes) < threshold {
 		return input
 	}
 
 	// Parse again the input StringMatcher to extract all values and storing them.
 	// We can skip the case sensitivity check because we've already checked it and
 	// if the code reach this point then it means all matchers have the same case sensitivity.
-	multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues)
+	multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues, numPrefixes, minPrefixLength)
 
 	// Ignore the return value because we already iterated over the input StringMatcher
 	// and it was all good.
-	findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
+	findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool {
 		multiMatcher.add(matcher.s)
 		return true
+	}, func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool {
+		multiMatcher.addPrefix(prefix, caseSensitive, matcher)
+		return true
 	})
 
 	return multiMatcher
 }
 
-// findEqualStringMatchers analyze the input StringMatcher and calls the callback for each
-// equalStringMatcher found. Returns true if and only if the input StringMatcher is *only*
-// composed by an alternation of equalStringMatcher.
-func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
+// findEqualOrPrefixStringMatchers analyzes the input StringMatcher and calls equalMatcherCallback for each
+// equalStringMatcher found, and prefixMatcherCallback for each literalPrefixSensitiveStringMatcher and literalPrefixInsensitiveStringMatcher found.
+//
+// Returns true if and only if the input StringMatcher is composed *only* of an alternation of equalStringMatcher and/or
+// literal prefix matchers, and no callback returned false. Returns false if prefixMatcherCallback is nil and a literal prefix matcher is encountered.
+func findEqualOrPrefixStringMatchers(input StringMatcher, equalMatcherCallback func(matcher *equalStringMatcher) bool, prefixMatcherCallback func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool) bool {
 	orInput, ok := input.(orStringMatcher)
 	if !ok {
 		return false
@@ -959,17 +1024,27 @@ func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalSt
 	for _, m := range orInput {
 		switch casted := m.(type) {
 		case orStringMatcher:
-			if !findEqualStringMatchers(m, callback) {
+			if !findEqualOrPrefixStringMatchers(m, equalMatcherCallback, prefixMatcherCallback) {
 				return false
 			}
 
 		case *equalStringMatcher:
-			if !callback(casted) {
+			if !equalMatcherCallback(casted) {
+				return false
+			}
+
+		case *literalPrefixSensitiveStringMatcher:
+			if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, true, casted) {
+				return false
+			}
+
+		case *literalPrefixInsensitiveStringMatcher:
+			if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, false, casted) {
 				return false
 			}
 
 		default:
-			// It's not an equal string matcher, so we have to stop searching
+			// It's not an equal or prefix string matcher, so we have to stop searching
 			// cause this optimization can't be applied.
 			return false
 		}
diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go
index fa5c96f4204..24875e64ef3 100644
--- a/model/labels/regexp_test.go
+++ b/model/labels/regexp_test.go
@@ -71,6 +71,8 @@ var (
 		// A long case insensitive alternation.
 		"(?i:(zQPbMkNO|NNSPdvMi|iWuuSoAl|qbvKMimS|IecrXtPa|seTckYqt|NxnyHkgB|fIDlOgKb|UhlWIygH|OtNoJxHG|cUTkFVIV|mTgFIHjr|jQkoIDtE|PPMKxRXl|AwMfwVkQ|CQyMrTQJ|BzrqxVSi|nTpcWuhF|PertdywG|ZZDgCtXN|WWdDPyyE|uVtNQsKk|BdeCHvPZ|wshRnFlH|aOUIitIp|RxZeCdXT|CFZMslCj|AVBZRDxl|IzIGCnhw|ythYuWiz|oztXVXhl|VbLkwqQx|qvaUgyVC|VawUjPWC|ecloYJuj|boCLTdSU|uPrKeAZx|hrMWLWBq|JOnUNHRM|rYnujkPq|dDEdZhIj|DRrfvugG|yEGfDxVV|YMYdJWuP|PHUQZNWM|AmKNrLis|zTxndVfn|FPsHoJnc|EIulZTua|KlAPhdzg|ScHJJCLt|NtTfMzME|eMCwuFdo|SEpJVJbR|cdhXZeCx|sAVtBwRh|kVFEVcMI|jzJrxraA|tGLHTell|NNWoeSaw|DcOKSetX|UXZAJyka|THpMphDP|rizheevl|kDCBRidd|pCZZRqyu|pSygkitl|SwZGkAaW|wILOrfNX|QkwVOerj|kHOMxPDr|EwOVycJv|AJvtzQFS|yEOjKYYB|LizIINLL|JBRSsfcG|YPiUqqNl|IsdEbvee|MjEpGcBm|OxXZVgEQ|xClXGuxa|UzRCGFEb|buJbvfvA|IPZQxRet|oFYShsMc|oBHffuHO|bzzKrcBR|KAjzrGCl|IPUsAVls|OGMUMbIU|gyDccHuR|bjlalnDd|ZLWjeMna|fdsuIlxQ|dVXtiomV|XxedTjNg|XWMHlNoA|nnyqArQX|opfkWGhb|wYtnhdYb))",
 		"(?i:(AAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBB|cccccccccccccccccccccccC|ſſſſſſſſſſſſſſſſſſſſſſſſS|SSSSSSSSSSSSSSSSSSSSSSSSſ))",
+		// A short case insensitive alternation where each entry ends with ".*".
+		"(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*))",
 		// A long case insensitive alternation where each entry ends with ".*".
 		"(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))",
 		// A long case insensitive alternation where each entry starts with ".*".
@@ -686,7 +688,15 @@ func randStrings(randGenerator *rand.Rand, many, length int) []string {
 	return out
 }
 
-func TestOptimizeEqualStringMatchers(t *testing.T) {
+func randStringsWithSuffix(randGenerator *rand.Rand, many, length int, suffix string) []string {
+	out := randStrings(randGenerator, many, length)
+	for i := range out {
+		out[i] += suffix
+	}
+	return out
+}
+
+func TestOptimizeEqualOrPrefixStringMatchers(t *testing.T) {
 	tests := map[string]struct {
 		input                 StringMatcher
 		expectedValues        []string
@@ -767,7 +777,7 @@ func TestOptimizeEqualStringMatchers(t *testing.T) {
 
 	for testName, testData := range tests {
 		t.Run(testName, func(t *testing.T) {
-			actualMatcher := optimizeEqualStringMatchers(testData.input, 0)
+			actualMatcher := optimizeEqualOrPrefixStringMatchers(testData.input, 0)
 
 			if testData.expectedValues == nil {
 				require.IsType(t, testData.input, actualMatcher)
@@ -782,10 +792,12 @@ func TestOptimizeEqualStringMatchers(t *testing.T) {
 
 func TestNewEqualMultiStringMatcher(t *testing.T) {
 	tests := map[string]struct {
-		values             []string
-		caseSensitive      bool
-		expectedValuesMap  map[string]struct{}
-		expectedValuesList []string
+		values                []string
+		caseSensitivePrefixes []*literalPrefixSensitiveStringMatcher
+		caseSensitive         bool
+		expectedValuesMap     map[string]struct{}
+		expectedPrefixesMap   map[string][]StringMatcher
+		expectedValuesList    []string
 	}{
 		"few case sensitive values": {
 			values:             []string{"a", "B"},
@@ -797,27 +809,47 @@ func TestNewEqualMultiStringMatcher(t *testing.T) {
 			caseSensitive:      false,
 			expectedValuesList: []string{"a", "B"},
 		},
+		"few case sensitive values and prefixes": {
+			values:                []string{"a"},
+			caseSensitivePrefixes: []*literalPrefixSensitiveStringMatcher{{prefix: "B", right: anyStringWithoutNewlineMatcher{}}},
+			caseSensitive:         true,
+			expectedValuesMap:     map[string]struct{}{"a": {}},
+			expectedPrefixesMap:   map[string][]StringMatcher{"B": {&literalPrefixSensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}}}},
+		},
 		"many case sensitive values": {
-			values:            []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
-			caseSensitive:     true,
-			expectedValuesMap: map[string]struct{}{"a": {}, "B": {}, "c": {}, "D": {}, "e": {}, "F": {}, "g": {}, "H": {}, "i": {}, "L": {}, "m": {}, "N": {}, "o": {}, "P": {}, "q": {}, "r": {}},
+			values:              []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
+			caseSensitive:       true,
+			expectedValuesMap:   map[string]struct{}{"a": {}, "B": {}, "c": {}, "D": {}, "e": {}, "F": {}, "g": {}, "H": {}, "i": {}, "L": {}, "m": {}, "N": {}, "o": {}, "P": {}, "q": {}, "r": {}},
+			expectedPrefixesMap: map[string][]StringMatcher{},
 		},
 		"many case insensitive values": {
-			values:            []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
-			caseSensitive:     false,
-			expectedValuesMap: map[string]struct{}{"a": {}, "b": {}, "c": {}, "d": {}, "e": {}, "f": {}, "g": {}, "h": {}, "i": {}, "l": {}, "m": {}, "n": {}, "o": {}, "p": {}, "q": {}, "r": {}},
+			values:              []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
+			caseSensitive:       false,
+			expectedValuesMap:   map[string]struct{}{"a": {}, "b": {}, "c": {}, "d": {}, "e": {}, "f": {}, "g": {}, "h": {}, "i": {}, "l": {}, "m": {}, "n": {}, "o": {}, "p": {}, "q": {}, "r": {}},
+			expectedPrefixesMap: map[string][]StringMatcher{},
 		},
 	}
 
 	for testName, testData := range tests {
 		t.Run(testName, func(t *testing.T) {
-			matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values))
+			// To keep this test simple, we always assume a min prefix length of 1.
+			minPrefixLength := 0
+			if len(testData.caseSensitivePrefixes) > 0 {
+				minPrefixLength = 1
+			}
+
+			matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values), len(testData.caseSensitivePrefixes), minPrefixLength)
 			for _, v := range testData.values {
 				matcher.add(v)
 			}
-			if testData.expectedValuesMap != nil {
+			for _, p := range testData.caseSensitivePrefixes {
+				matcher.addPrefix(p.prefix, true, p)
+			}
+
+			if testData.expectedValuesMap != nil || testData.expectedPrefixesMap != nil {
 				require.IsType(t, &equalMultiStringMapMatcher{}, matcher)
 				require.Equal(t, testData.expectedValuesMap, matcher.(*equalMultiStringMapMatcher).values)
+				require.Equal(t, testData.expectedPrefixesMap, matcher.(*equalMultiStringMapMatcher).prefixes)
 				require.Equal(t, testData.caseSensitive, matcher.(*equalMultiStringMapMatcher).caseSensitive)
 			}
 			if testData.expectedValuesList != nil {
@@ -829,9 +861,32 @@ func TestNewEqualMultiStringMatcher(t *testing.T) {
 	}
 }
 
+func TestEqualMultiStringMapMatcher_addPrefix(t *testing.T) {
+	t.Run("should panic if the matcher is case sensitive but the prefix is not case sensitive", func(t *testing.T) {
+		matcher := newEqualMultiStringMatcher(true, 0, 1, 1)
+
+		require.Panics(t, func() {
+			matcher.addPrefix("a", false, &literalPrefixInsensitiveStringMatcher{
+				prefix: "a",
+			})
+		})
+	})
+
+	t.Run("should panic if the matcher is not case sensitive but the prefix is case sensitive", func(t *testing.T) {
+		matcher := newEqualMultiStringMatcher(false, 0, 1, 1)
+
+		require.Panics(t, func() {
+			matcher.addPrefix("a", true, &literalPrefixSensitiveStringMatcher{
+				prefix: "a",
+			})
+		})
+	})
+}
+
 func TestEqualMultiStringMatcher_Matches(t *testing.T) {
 	tests := map[string]struct {
 		values             []string
+		prefixes           []StringMatcher
 		caseSensitive      bool
 		expectedMatches    []string
 		expectedNotMatches []string
@@ -848,6 +903,24 @@ func TestEqualMultiStringMatcher_Matches(t *testing.T) {
 			expectedMatches:    []string{"a", "A", "b", "B"},
 			expectedNotMatches: []string{"c", "C"},
 		},
+		"few case sensitive prefixes": {
+			prefixes: []StringMatcher{
+				&literalPrefixSensitiveStringMatcher{prefix: "a", right: anyStringWithoutNewlineMatcher{}},
+				&literalPrefixSensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}},
+			},
+			caseSensitive:      true,
+			expectedMatches:    []string{"a", "aX", "B", "BX"},
+			expectedNotMatches: []string{"A", "b"},
+		},
+		"few case insensitive prefixes": {
+			prefixes: []StringMatcher{
+				&literalPrefixInsensitiveStringMatcher{prefix: "a", right: anyStringWithoutNewlineMatcher{}},
+				&literalPrefixInsensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}},
+			},
+			caseSensitive:      false,
+			expectedMatches:    []string{"a", "aX", "A", "AX", "b", "bX", "B", "BX"},
+			expectedNotMatches: []string{"c", "cX", "C", "CX"},
+		},
 		"many case sensitive values": {
 			values:             []string{"a", "B", "c", "D", "e", "F", "g", "H", "i", "L", "m", "N", "o", "P", "q", "r"},
 			caseSensitive:      true,
@@ -860,14 +933,37 @@ func TestEqualMultiStringMatcher_Matches(t *testing.T) {
 			expectedMatches:    []string{"a", "A", "b", "B"},
 			expectedNotMatches: []string{"x", "X"},
 		},
+		"mixed values and prefixes": {
+			values:             []string{"a"},
+			prefixes:           []StringMatcher{&literalPrefixSensitiveStringMatcher{prefix: "B", right: anyStringWithoutNewlineMatcher{}}},
+			caseSensitive:      true,
+			expectedMatches:    []string{"a", "B", "BX"},
+			expectedNotMatches: []string{"aX", "A", "b", "bX"},
+		},
 	}
 
 	for testName, testData := range tests {
 		t.Run(testName, func(t *testing.T) {
-			matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values))
+			// To keep this test simple, we always assume a min prefix length of 1.
+			minPrefixLength := 0
+			if len(testData.prefixes) > 0 {
+				minPrefixLength = 1
+			}
+
+			matcher := newEqualMultiStringMatcher(testData.caseSensitive, len(testData.values), len(testData.prefixes), minPrefixLength)
 			for _, v := range testData.values {
 				matcher.add(v)
 			}
+			for _, p := range testData.prefixes {
+				switch m := p.(type) {
+				case *literalPrefixSensitiveStringMatcher:
+					matcher.addPrefix(m.prefix, true, p)
+				case *literalPrefixInsensitiveStringMatcher:
+					matcher.addPrefix(m.prefix, false, p)
+				default:
+					panic("Unexpected type in test case")
+				}
+			}
 
 			for _, v := range testData.expectedMatches {
 				require.True(t, matcher.Matches(v), "value: %s", v)
@@ -879,29 +975,33 @@ func TestEqualMultiStringMatcher_Matches(t *testing.T) {
 	}
 }
 
-func TestFindEqualStringMatchers(t *testing.T) {
+func TestFindEqualOrPrefixStringMatchers(t *testing.T) {
 	type match struct {
 		s             string
 		caseSensitive bool
 	}
 
-	// Utility to call findEqualStringMatchers() and collect all callback invocations.
-	findEqualStringMatchersAndCollectMatches := func(input StringMatcher) (matches []match, ok bool) {
-		ok = findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
+	// Utility to call findEqualOrPrefixStringMatchers() and collect all callback invocations.
+	findEqualOrPrefixStringMatchersAndCollectMatches := func(input StringMatcher) (matches []match, ok bool) {
+		ok = findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool {
 			matches = append(matches, match{matcher.s, matcher.caseSensitive})
 			return true
+		}, func(prefix string, prefixCaseSensitive bool, right StringMatcher) bool {
+			matches = append(matches, match{prefix, prefixCaseSensitive})
+			return true
 		})
+
 		return
 	}
 
 	t.Run("empty matcher", func(t *testing.T) {
-		actualMatches, actualOk := findEqualStringMatchersAndCollectMatches(emptyStringMatcher{})
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(emptyStringMatcher{})
 		require.False(t, actualOk)
 		require.Empty(t, actualMatches)
 	})
 
 	t.Run("concat of literal matchers (case sensitive)", func(t *testing.T) {
-		actualMatches, actualOk := findEqualStringMatchersAndCollectMatches(
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
 			orStringMatcher{
 				&equalStringMatcher{s: "test-1", caseSensitive: true},
 				&equalStringMatcher{s: "test-2", caseSensitive: true},
@@ -913,7 +1013,7 @@ func TestFindEqualStringMatchers(t *testing.T) {
 	})
 
 	t.Run("concat of literal matchers (case insensitive)", func(t *testing.T) {
-		actualMatches, actualOk := findEqualStringMatchersAndCollectMatches(
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
 			orStringMatcher{
 				&equalStringMatcher{s: "test-1", caseSensitive: false},
 				&equalStringMatcher{s: "test-2", caseSensitive: false},
@@ -925,7 +1025,7 @@ func TestFindEqualStringMatchers(t *testing.T) {
 	})
 
 	t.Run("concat of literal matchers (mixed case)", func(t *testing.T) {
-		actualMatches, actualOk := findEqualStringMatchersAndCollectMatches(
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
 			orStringMatcher{
 				&equalStringMatcher{s: "test-1", caseSensitive: false},
 				&equalStringMatcher{s: "test-2", caseSensitive: true},
@@ -935,11 +1035,59 @@ func TestFindEqualStringMatchers(t *testing.T) {
 		require.True(t, actualOk)
 		require.Equal(t, []match{{"test-1", false}, {"test-2", true}}, actualMatches)
 	})
+
+	t.Run("concat of literal prefix matchers (case sensitive)", func(t *testing.T) {
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
+			orStringMatcher{
+				&literalPrefixSensitiveStringMatcher{prefix: "test-1"},
+				&literalPrefixSensitiveStringMatcher{prefix: "test-2"},
+			},
+		)
+
+		require.True(t, actualOk)
+		require.Equal(t, []match{{"test-1", true}, {"test-2", true}}, actualMatches)
+	})
+
+	t.Run("concat of literal prefix matchers (case insensitive)", func(t *testing.T) {
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
+			orStringMatcher{
+				&literalPrefixInsensitiveStringMatcher{prefix: "test-1"},
+				&literalPrefixInsensitiveStringMatcher{prefix: "test-2"},
+			},
+		)
+
+		require.True(t, actualOk)
+		require.Equal(t, []match{{"test-1", false}, {"test-2", false}}, actualMatches)
+	})
+
+	t.Run("concat of literal prefix matchers (mixed case)", func(t *testing.T) {
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
+			orStringMatcher{
+				&literalPrefixInsensitiveStringMatcher{prefix: "test-1"},
+				&literalPrefixSensitiveStringMatcher{prefix: "test-2"},
+			},
+		)
+
+		require.True(t, actualOk)
+		require.Equal(t, []match{{"test-1", false}, {"test-2", true}}, actualMatches)
+	})
+
+	t.Run("concat of literal string and prefix matchers (case sensitive)", func(t *testing.T) {
+		actualMatches, actualOk := findEqualOrPrefixStringMatchersAndCollectMatches(
+			orStringMatcher{
+				&equalStringMatcher{s: "test-1", caseSensitive: true},
+				&literalPrefixSensitiveStringMatcher{prefix: "test-2"},
+			},
+		)
+
+		require.True(t, actualOk)
+		require.Equal(t, []match{{"test-1", true}, {"test-2", true}}, actualMatches)
+	})
 }
 
 // This benchmark is used to find a good threshold to use to apply the optimization
-// done by optimizeEqualStringMatchers().
-func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
+// done by optimizeEqualOrPrefixStringMatchers().
+func BenchmarkOptimizeEqualOrPrefixStringMatchers(b *testing.B) {
 	randGenerator := rand.New(rand.NewSource(time.Now().UnixNano()))
 
 	// Generate variable lengths random texts to match against.
@@ -949,42 +1097,51 @@ func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
 
 	for numAlternations := 2; numAlternations <= 256; numAlternations *= 2 {
 		for _, caseSensitive := range []bool{true, false} {
-			b.Run(fmt.Sprintf("alternations: %d case sensitive: %t", numAlternations, caseSensitive), func(b *testing.B) {
-				// Generate a regex with the expected number of alternations.
-				re := strings.Join(randStrings(randGenerator, numAlternations, 10), "|")
-				if !caseSensitive {
-					re = "(?i:(" + re + "))"
-				}
+			for _, prefixMatcher := range []bool{true, false} {
+				b.Run(fmt.Sprintf("alternations: %d case sensitive: %t prefix matcher: %t", numAlternations, caseSensitive, prefixMatcher), func(b *testing.B) {
+					// If the test should run on prefix matchers, we add a wildcard matcher as suffix (prefix will be a literal).
+					suffix := ""
+					if prefixMatcher {
+						suffix = ".*"
+					}
 
-				parsed, err := syntax.Parse(re, syntax.Perl)
-				require.NoError(b, err)
+					// Generate a regex with the expected number of alternations.
+					re := strings.Join(randStringsWithSuffix(randGenerator, numAlternations, 10, suffix), "|")
+					if !caseSensitive {
+						re = "(?i:(" + re + "))"
+					}
+					b.Logf("regexp: %s", re)
 
-				unoptimized := stringMatcherFromRegexpInternal(parsed)
-				require.IsType(b, orStringMatcher{}, unoptimized)
+					parsed, err := syntax.Parse(re, syntax.Perl)
+					require.NoError(b, err)
 
-				optimized := optimizeEqualStringMatchers(unoptimized, 0)
-				if numAlternations < minEqualMultiStringMatcherMapThreshold {
-					require.IsType(b, &equalMultiStringSliceMatcher{}, optimized)
-				} else {
-					require.IsType(b, &equalMultiStringMapMatcher{}, optimized)
-				}
+					unoptimized := stringMatcherFromRegexpInternal(parsed)
+					require.IsType(b, orStringMatcher{}, unoptimized)
 
-				b.Run("without optimizeEqualStringMatchers()", func(b *testing.B) {
-					for n := 0; n < b.N; n++ {
-						for _, t := range texts {
-							unoptimized.Matches(t)
-						}
+					optimized := optimizeEqualOrPrefixStringMatchers(unoptimized, 0)
+					if numAlternations < minEqualMultiStringMatcherMapThreshold && !prefixMatcher {
+						require.IsType(b, &equalMultiStringSliceMatcher{}, optimized)
+					} else {
+						require.IsType(b, &equalMultiStringMapMatcher{}, optimized)
 					}
-				})
 
-				b.Run("with optimizeEqualStringMatchers()", func(b *testing.B) {
-					for n := 0; n < b.N; n++ {
-						for _, t := range texts {
-							optimized.Matches(t)
+					b.Run("without optimizeEqualOrPrefixStringMatchers()", func(b *testing.B) {
+						for n := 0; n < b.N; n++ {
+							for _, t := range texts {
+								unoptimized.Matches(t)
+							}
+						}
+					})
+
+					b.Run("with optimizeEqualOrPrefixStringMatchers()", func(b *testing.B) {
+						for n := 0; n < b.N; n++ {
+							for _, t := range texts {
+								optimized.Matches(t)
+							}
 						}
-					}
+					})
 				})
-			})
+			}
 		}
 	}
 }
@@ -1204,10 +1361,16 @@ func visitStringMatcher(matcher StringMatcher, callback func(matcher StringMatch
 		}
 
-	// No nested matchers for the following ones.
+	case *equalMultiStringMapMatcher:
+		for _, prefixes := range casted.prefixes {
+			for _, matcher := range prefixes {
+				visitStringMatcher(matcher, callback)
+			}
+		}
+
+	// No nested matchers for the following ones.
 	case emptyStringMatcher:
 	case *equalStringMatcher:
 	case *equalMultiStringSliceMatcher:
-	case *equalMultiStringMapMatcher:
 	case anyStringWithoutNewlineMatcher:
 	case *anyNonEmptyStringMatcher:
 	case trueMatcher:

From d0747f38e44dba980be5ddadb61733b1c8b2aa50 Mon Sep 17 00:00:00 2001
From: Christian Simon <simon@swine.de>
Date: Wed, 7 Aug 2024 17:31:41 +0100
Subject: [PATCH 2/2] Add pyrobench

---
 .github/workflows/pyrobench.yaml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 .github/workflows/pyrobench.yaml

diff --git a/.github/workflows/pyrobench.yaml b/.github/workflows/pyrobench.yaml
new file mode 100644
index 00000000000..0fbc9e2e43b
--- /dev/null
+++ b/.github/workflows/pyrobench.yaml
@@ -0,0 +1,29 @@
+name: Pyrobench
+on: [pull_request]
+jobs:
+  test:
+    name: Bench
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Set up Go 1.22
+      uses: actions/setup-go@v5
+      with:
+        go-version: 1.22
+      id: go
+
+    - name: Check out code into the Go module directory
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 1024
+
+    - run: |
+        # Ensure base branch is fetched
+        git fetch origin "${GITHUB_BASE_REF}" --depth 1024
+        echo "GITHUB_BASE_REF=${GITHUB_BASE_REF}"
+
+    - name: Run Benchmark
+      run: |
+        go run github.com/grafana/pyrobench@26f65feff9ff5ad092fb21b6931be8b730554a86 -v compare --git-base origin/$GITHUB_BASE_REF --github-commenter
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}