From 6e7cd10a8f2840ac0bcba48b039f3d40afdf97b7 Mon Sep 17 00:00:00 2001 From: Kousik Mitra Date: Tue, 2 May 2023 23:41:19 +0530 Subject: [PATCH] Support to filter traces using `!~` (#2410) * Enable support for not regex operation * Add predicate to match not regex operations * Support for searching pattern with not regex op * Add doc for negated regex * Update changelog * Replace asserts with require * move dictString def to pkg level as testDictString * Add benchmark for regex predicate * Revert auto format changes in yaml file --- CHANGELOG.md | 3 +- docs/sources/tempo/traceql/_index.md | 1 + pkg/parquetquery/predicate_test.go | 200 ++++++++++++++---- pkg/parquetquery/predicates.go | 45 ++-- pkg/traceql/ast_validate.go | 3 +- pkg/traceql/test_examples.yaml | 6 +- tempodb/encoding/vparquet/block_traceql.go | 4 +- .../encoding/vparquet/block_traceql_test.go | 2 + tempodb/encoding/vparquet2/block_traceql.go | 4 +- .../encoding/vparquet2/block_traceql_test.go | 2 + 10 files changed, 202 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 867936c4d3a..9b691ffe498 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## main / unreleased -* [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.azure` and `storage.trace.gcs` [#2362](https://github.com/grafana/tempo/pull/2386) (@kousikmitra) +* [ENHANCEMENT] Add support to filter using negated regex operator `!~` [#2410](https://github.com/grafana/tempo/pull/2410) (@kousikmitra) +* [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.azure` and `storage.trace.gcs` [#2386](https://github.com/grafana/tempo/pull/2386) (@kousikmitra) * [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.s3` [#2362](https://github.com/grafana/tempo/pull/2362) (@kousikmitra) * [FEATURE] Add support for `q` query param in `/api/v2/search//values` to filter results based on a TraceQL query [#2253](https://github.com/grafana/tempo/pull/2253) (@mapno) * [FEATURE] Add a GRPC streaming endpoint for traceql search [#2366](https://github.com/grafana/tempo/pull/2366) (@joe-elliott) diff --git a/docs/sources/tempo/traceql/_index.md b/docs/sources/tempo/traceql/_index.md index eb5592e54ba..782c2fb14f0 100644 --- a/docs/sources/tempo/traceql/_index.md +++ b/docs/sources/tempo/traceql/_index.md @@ -120,6 +120,7 @@ The implemented comparison operators are: - `<` (less than) - `<=` (less than or equal to) - `=~` (regular expression) +- `!~` (negated regular expression) TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries. diff --git a/pkg/parquetquery/predicate_test.go b/pkg/parquetquery/predicate_test.go index c93e75e3765..cf36d9da349 100644 --- a/pkg/parquetquery/predicate_test.go +++ b/pkg/parquetquery/predicate_test.go @@ -10,8 +10,6 @@ import ( "github.com/stretchr/testify/require" ) -var _ Predicate = (*mockPredicate)(nil) - type mockPredicate struct { ret bool valCalled bool @@ -19,6 +17,12 @@ type mockPredicate struct { chunkCalled bool } +type testDictString struct { + S string `parquet:",dict"` +} + +var _ Predicate = (*mockPredicate)(nil) + func newAlwaysTruePredicate() *mockPredicate { return &mockPredicate{ret: true} } @@ -32,41 +36,140 @@ func (p *mockPredicate) KeepValue(parquet.Value) bool { p.valCalled func (p *mockPredicate) KeepPage(parquet.Page) bool { p.pageCalled = true; return p.ret } func (p *mockPredicate) KeepColumnChunk(parquet.ColumnChunk) bool { p.chunkCalled = true; return p.ret } +type predicateTestCase struct { + testName string + writeData func(w *parquet.Writer) //nolint:all + keptChunks int + keptPages int + keptValues int + predicate Predicate +} + func TestSubstringPredicate(t *testing.T) { + testCases := []predicateTestCase{ + { + testName: "all chunks/pages/values inspected", + predicate: NewSubstringPredicate("b"), + keptChunks: 1, + keptPages: 1, + keptValues: 2, + writeData: func(w *parquet.Writer) { //nolint:all + + require.NoError(t, w.Write(&testDictString{"abc"})) // kept + require.NoError(t, w.Write(&testDictString{"bcd"})) // kept + require.NoError(t, w.Write(&testDictString{"cde"})) // skipped + }, + }, + { + testName: "dictionary in the page header allows for skipping a page", + predicate: NewSubstringPredicate("x"), // Not present in any values + keptChunks: 1, + keptPages: 0, + keptValues: 0, + writeData: func(w *parquet.Writer) { //nolint:all + require.NoError(t, w.Write(&testDictString{"abc"})) + require.NoError(t, w.Write(&testDictString{"abc"})) + require.NoError(t, w.Write(&testDictString{"abc"})) + require.NoError(t, w.Write(&testDictString{"abc"})) + require.NoError(t, w.Write(&testDictString{"abc"})) + }, + }, + } + + for _, tC := range testCases { + t.Run(tC.testName, func(t *testing.T) { + testPredicate(t, tC) + }) + } +} + +func TestNewRegexInPredicate(t *testing.T) { + testCases := []predicateTestCase{ + { + testName: "all chunks/pages/values inspected", + predicate: func() Predicate { + pred, err := NewRegexInPredicate([]string{"a.*"}) + require.NoError(t, err) + + return pred + }(), + keptChunks: 1, + keptPages: 1, + keptValues: 2, + writeData: func(w *parquet.Writer) { //nolint:all + require.NoError(t, w.Write(&testDictString{"abc"})) // kept + require.NoError(t, w.Write(&testDictString{"acd"})) // kept + require.NoError(t, w.Write(&testDictString{"cde"})) // skipped + }, + }, + { + testName: "dictionary in the page header allows for skipping a page", + predicate: func() Predicate { + pred, err := NewRegexInPredicate([]string{"x.*"}) + require.NoError(t, err) + + return pred + }(), // Not present in any values + keptChunks: 1, + keptPages: 0, + keptValues: 0, + writeData: func(w *parquet.Writer) { //nolint:all + require.NoError(t, w.Write(&testDictString{"abc"})) + require.NoError(t, w.Write(&testDictString{"abc"})) + }, + }, + } - // Normal case - all chunks/pages/values inspected - testPredicate(t, predicateTestCase{ - predicate: NewSubstringPredicate("b"), - keptChunks: 1, - keptPages: 1, - keptValues: 2, - writeData: func(w *parquet.Writer) { //nolint:all - type String struct { - S string `parquet:",dict"` - } - require.NoError(t, w.Write(&String{"abc"})) // kept - require.NoError(t, w.Write(&String{"bcd"})) // kept - require.NoError(t, w.Write(&String{"cde"})) // skipped + for _, tC := range testCases { + t.Run(tC.testName, func(t *testing.T) { + testPredicate(t, tC) + }) + } +} + +func TestNewRegexNotInPredicate(t *testing.T) { + testCases := []predicateTestCase{ + { + testName: "all chunks/pages/values inspected", + predicate: func() Predicate { + pred, err := NewRegexNotInPredicate([]string{"a.*"}) + require.NoError(t, err) + + return pred + }(), + keptChunks: 1, + keptPages: 1, + keptValues: 2, + writeData: func(w *parquet.Writer) { //nolint:all + require.NoError(t, w.Write(&testDictString{"abc"})) // skipped + require.NoError(t, w.Write(&testDictString{"acd"})) // skipped + require.NoError(t, w.Write(&testDictString{"cde"})) // kept + require.NoError(t, w.Write(&testDictString{"xde"})) // kept + }, }, - }) - - // Dictionary in the page header allows for skipping a page - testPredicate(t, predicateTestCase{ - predicate: NewSubstringPredicate("x"), // Not present in any values - keptChunks: 1, - keptPages: 0, - keptValues: 0, - writeData: func(w *parquet.Writer) { //nolint:all - type dictString struct { - S string `parquet:",dict"` - } - require.NoError(t, w.Write(&dictString{"abc"})) - require.NoError(t, w.Write(&dictString{"abc"})) - require.NoError(t, w.Write(&dictString{"abc"})) - require.NoError(t, w.Write(&dictString{"abc"})) - require.NoError(t, w.Write(&dictString{"abc"})) + { + testName: "dictionary in the page header allows for skipping a page", + predicate: func() Predicate { + pred, err := NewRegexNotInPredicate([]string{"x.*"}) + require.NoError(t, err) + + return pred + }(), // Not present in any values + keptChunks: 1, + keptPages: 0, + keptValues: 0, + writeData: func(w *parquet.Writer) { //nolint:all + require.NoError(t, w.Write(&testDictString{"xyz"})) + require.NoError(t, w.Write(&testDictString{"xyz"})) + }, }, - }) + } + + for _, tC := range testCases { + t.Run(tC.testName, func(t *testing.T) { + testPredicate(t, tC) + }) + } } // TestOrPredicateCallsKeepColumnChunk ensures that the OrPredicate calls @@ -120,17 +223,10 @@ func TestOrPredicateCallsKeepColumnChunk(t *testing.T) { } } -type predicateTestCase struct { - writeData func(w *parquet.Writer) //nolint:all - keptChunks int - keptPages int - keptValues int - predicate Predicate -} - -// testPredicate by writing data and then iterating the column. The data model -// must contain a single column. +// testPredicate by writing data and then iterating the column. +// The data model must contain a single column. func testPredicate(t *testing.T, tc predicateTestCase) { + t.Helper() buf := new(bytes.Buffer) w := parquet.NewWriter(buf) tc.writeData(w) @@ -190,3 +286,21 @@ func BenchmarkStringInPredicate(b *testing.B) { } } } + +func BenchmarkRegexInPredicate(b *testing.B) { + p, err := NewRegexInPredicate([]string{"abc"}) + require.NoError(b, err) + + s := make([]parquet.Value, 1000) + for i := 0; i < 1000; i++ { + s[i] = parquet.ValueOf(uuid.New().String()) + } + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + for _, ss := range s { + p.KeepValue(ss) + } + } +} diff --git a/pkg/parquetquery/predicates.go b/pkg/parquetquery/predicates.go index 0acd8d403c3..f5b78edd556 100644 --- a/pkg/parquetquery/predicates.go +++ b/pkg/parquetquery/predicates.go @@ -82,21 +82,33 @@ func (p *StringInPredicate) KeepPage(page pq.Page) bool { return p.helper.keepPage(page, p.KeepValue) } -// RegexInPredicate checks for match against any of the given regexs. -// Memoized and resets on each row group. -type RegexInPredicate struct { - regs []*regexp.Regexp - matches map[string]bool +type regexPredicate struct { + regs []*regexp.Regexp + matches map[string]bool + shouldMatch bool helper DictionaryPredicateHelper } -var _ Predicate = (*RegexInPredicate)(nil) +var _ Predicate = (*regexPredicate)(nil) + +// NewRegexInPredicate checks for match against any of the given regexs. +// Memoized and resets on each row group. +func NewRegexInPredicate(regs []string) (Predicate, error) { + return newRegexPredicate(regs, true) +} + +// NewRegexNotInPredicate checks for values that not match against any of the given regexs. +// Memoized and resets on each row group. +func NewRegexNotInPredicate(regs []string) (Predicate, error) { + return newRegexPredicate(regs, false) +} -func NewRegexInPredicate(regs []string) (*RegexInPredicate, error) { - p := &RegexInPredicate{ - regs: make([]*regexp.Regexp, 0, len(regs)), - matches: make(map[string]bool), +func newRegexPredicate(regs []string, shouldMatch bool) (Predicate, error) { + p := ®exPredicate{ + regs: make([]*regexp.Regexp, 0, len(regs)), + matches: make(map[string]bool), + shouldMatch: shouldMatch, } for _, reg := range regs { r, err := regexp.Compile(reg) @@ -108,7 +120,7 @@ func NewRegexInPredicate(regs []string) (*RegexInPredicate, error) { return p, nil } -func (p *RegexInPredicate) String() string { +func (p *regexPredicate) String() string { var strings string for _, s := range p.regs { strings += fmt.Sprintf("%s, ", s.String()) @@ -116,9 +128,8 @@ func (p *RegexInPredicate) String() string { return fmt.Sprintf("RegexInPredicate{%s}", strings) } -func (p *RegexInPredicate) keep(v *pq.Value) bool { +func (p *regexPredicate) keep(v *pq.Value) bool { if v.IsNull() { - // Null return false } @@ -129,7 +140,7 @@ func (p *RegexInPredicate) keep(v *pq.Value) bool { matched := false for _, r := range p.regs { - if r.MatchString(s) { + if r.MatchString(s) == p.shouldMatch { matched = true break } @@ -139,7 +150,7 @@ func (p *RegexInPredicate) keep(v *pq.Value) bool { return matched } -func (p *RegexInPredicate) KeepColumnChunk(cc pq.ColumnChunk) bool { +func (p *regexPredicate) KeepColumnChunk(cc pq.ColumnChunk) bool { p.helper.setNewRowGroup() // Reset match cache on each row group change @@ -149,11 +160,11 @@ func (p *RegexInPredicate) KeepColumnChunk(cc pq.ColumnChunk) bool { return true } -func (p *RegexInPredicate) KeepValue(v pq.Value) bool { +func (p *regexPredicate) KeepValue(v pq.Value) bool { return p.keep(&v) } -func (p *RegexInPredicate) KeepPage(page pq.Page) bool { +func (p *regexPredicate) KeepPage(page pq.Page) bool { return p.helper.keepPage(page, p.KeepValue) } diff --git a/pkg/traceql/ast_validate.go b/pkg/traceql/ast_validate.go index cf5d9d3cc47..30dff1c4880 100644 --- a/pkg/traceql/ast_validate.go +++ b/pkg/traceql/ast_validate.go @@ -178,8 +178,7 @@ func (o BinaryOperation) validate() error { } switch o.Op { - case OpNotRegex, - OpSpansetChild, + case OpSpansetChild, OpSpansetDescendant, OpSpansetSibling: return newUnsupportedError(fmt.Sprintf("binary operation (%v)", o.Op)) diff --git a/pkg/traceql/test_examples.yaml b/pkg/traceql/test_examples.yaml index 6385b8d3e93..cf428e70435 100644 --- a/pkg/traceql/test_examples.yaml +++ b/pkg/traceql/test_examples.yaml @@ -13,6 +13,7 @@ valid: - '{ 1 <= 2 }' - '{ -1 = 2 }' - '{ "test" =~ "test" }' + - '{ "test" !~ "test" }' - '{ "test" = "test" }' - '{ "test" != "test" }' - '{ .a }' @@ -27,6 +28,7 @@ valid: - '{ .a <= 2 }' - '{ -.a = 2 }' - '{ .a =~ "test" }' + - '{ .a !~ "test" }' - '{ .a = "test" }' - '{ .a != "test" }' - '{ resource.a != 3 }' @@ -159,6 +161,7 @@ validate_fails: - '{ 1 > ok }' - '{ 1 = name }' - '{ 1 =~ 2}' + - '{ 1 !~ 2}' - '{ 1 && "foo" }' - '{ 1 || ok }' - '{ true || 1.1 }' @@ -228,9 +231,6 @@ unsupported: - '{ !parent = nil }' # nil - will be valid when supported - '{ .foo = nil }' - # binary operations - will be valid when supported - - '{ "test" !~ "test" }' - - '{ .a !~ "test" }' # childCount - will be valid when supported - '{ 1 = childCount }' # childCount - will be invalid when supported diff --git a/tempodb/encoding/vparquet/block_traceql.go b/tempodb/encoding/vparquet/block_traceql.go index 255ab26393b..ce620dbc090 100644 --- a/tempodb/encoding/vparquet/block_traceql.go +++ b/tempodb/encoding/vparquet/block_traceql.go @@ -195,7 +195,7 @@ func checkConditions(conditions []traceql.Condition) error { case traceql.OpEqual, traceql.OpNotEqual, traceql.OpGreater, traceql.OpGreaterEqual, traceql.OpLess, traceql.OpLessEqual, - traceql.OpRegex: + traceql.OpRegex, traceql.OpNotRegex: if opCount != 1 { return fmt.Errorf("operation %v must have exactly 1 argument. condition: %+v", cond.Op, cond) } @@ -837,6 +837,8 @@ func createStringPredicate(op traceql.Operator, operands traceql.Operands) (parq case traceql.OpRegex: return parquetquery.NewRegexInPredicate([]string{s}) + case traceql.OpNotRegex: + return parquetquery.NewRegexNotInPredicate([]string{s}) case traceql.OpEqual: return parquetquery.NewStringInPredicate([]string{s}), nil case traceql.OpGreater: diff --git a/tempodb/encoding/vparquet/block_traceql_test.go b/tempodb/encoding/vparquet/block_traceql_test.go index 399d7e35900..ff6cf89b9f1 100644 --- a/tempodb/encoding/vparquet/block_traceql_test.go +++ b/tempodb/encoding/vparquet/block_traceql_test.go @@ -138,6 +138,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) { traceql.MustExtractFetchSpansRequest(`{.foo = "def"}`), // String == traceql.MustExtractFetchSpansRequest(`{.foo != "deg"}`), // String != traceql.MustExtractFetchSpansRequest(`{.foo =~ "d.*"}`), // String Regex + traceql.MustExtractFetchSpansRequest(`{.foo !~ "x.*"}`), // String Not Regex traceql.MustExtractFetchSpansRequest(`{resource.foo = "abc"}`), // Resource-level only traceql.MustExtractFetchSpansRequest(`{span.foo = "def"}`), // Span-level only traceql.MustExtractFetchSpansRequest(`{.foo}`), // Projection only @@ -222,6 +223,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) { // TODO - Should the below query return data or not? It does match the resource // makeReq(parse(t, `{.foo = "abc"}`)), // This should not return results because the span has overridden this attribute to "def". traceql.MustExtractFetchSpansRequest(`{.foo =~ "xyz.*"}`), // Regex IN + traceql.MustExtractFetchSpansRequest(`{.foo !~ ".*"}`), // Regex IN traceql.MustExtractFetchSpansRequest(`{span.bool = true}`), // Bool not match traceql.MustExtractFetchSpansRequest(`{` + LabelDuration + ` > 100s}`), // Intrinsic: duration traceql.MustExtractFetchSpansRequest(`{` + LabelStatus + ` = ok}`), // Intrinsic: status diff --git a/tempodb/encoding/vparquet2/block_traceql.go b/tempodb/encoding/vparquet2/block_traceql.go index 5b4d63fab1a..0172ef7750f 100644 --- a/tempodb/encoding/vparquet2/block_traceql.go +++ b/tempodb/encoding/vparquet2/block_traceql.go @@ -196,7 +196,7 @@ func checkConditions(conditions []traceql.Condition) error { case traceql.OpEqual, traceql.OpNotEqual, traceql.OpGreater, traceql.OpGreaterEqual, traceql.OpLess, traceql.OpLessEqual, - traceql.OpRegex: + traceql.OpRegex, traceql.OpNotRegex: if opCount != 1 { return fmt.Errorf("operation %v must have exactly 1 argument. condition: %+v", cond.Op, cond) } @@ -825,6 +825,8 @@ func createStringPredicate(op traceql.Operator, operands traceql.Operands) (parq case traceql.OpRegex: return parquetquery.NewRegexInPredicate([]string{s}) + case traceql.OpNotRegex: + return parquetquery.NewRegexNotInPredicate([]string{s}) case traceql.OpEqual: return parquetquery.NewStringInPredicate([]string{s}), nil diff --git a/tempodb/encoding/vparquet2/block_traceql_test.go b/tempodb/encoding/vparquet2/block_traceql_test.go index b4e9707928d..0fbb58b5d10 100644 --- a/tempodb/encoding/vparquet2/block_traceql_test.go +++ b/tempodb/encoding/vparquet2/block_traceql_test.go @@ -139,6 +139,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) { traceql.MustExtractFetchSpansRequest(`{.foo = "def"}`), // String == traceql.MustExtractFetchSpansRequest(`{.foo != "deg"}`), // String != traceql.MustExtractFetchSpansRequest(`{.foo =~ "d.*"}`), // String Regex + traceql.MustExtractFetchSpansRequest(`{.foo !~ "x.*"}`), // String Not Regex traceql.MustExtractFetchSpansRequest(`{resource.foo = "abc"}`), // Resource-level only traceql.MustExtractFetchSpansRequest(`{span.foo = "def"}`), // Span-level only traceql.MustExtractFetchSpansRequest(`{.foo}`), // Projection only @@ -223,6 +224,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) { // TODO - Should the below query return data or not? It does match the resource // makeReq(parse(t, `{.foo = "abc"}`)), // This should not return results because the span has overridden this attribute to "def". traceql.MustExtractFetchSpansRequest(`{.foo =~ "xyz.*"}`), // Regex IN + traceql.MustExtractFetchSpansRequest(`{.foo !~ ".*"}`), // String Not Regex traceql.MustExtractFetchSpansRequest(`{span.bool = true}`), // Bool not match traceql.MustExtractFetchSpansRequest(`{` + LabelDuration + ` > 100s}`), // Intrinsic: duration traceql.MustExtractFetchSpansRequest(`{` + LabelStatus + ` = ok}`), // Intrinsic: status