From cf05a9b2a5f3ed1c531e91b28ba5ff97950ca147 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Mon, 15 Jul 2024 00:20:21 -0700 Subject: [PATCH] Structural search: fix precise lang filtering (#63791) The feature `search-content-based-lang-detection` was never implemented for structural search. This PR fills in the missing pieces. Fixes #61714 Relates to https://linear.app/sourcegraph/project/make-precise-lang-filtering-default-4f06cfa28567 ## Test plan Two new unit tests checking that the lang filters are now correctly passed to Zoekt. --- cmd/searcher/internal/search/zoekt_search.go | 18 +++++-- .../internal/search/zoekt_search_test.go | 53 +++++++++++++++++++ internal/search/job/jobutil/job_test.go | 4 ++ 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/cmd/searcher/internal/search/zoekt_search.go b/cmd/searcher/internal/search/zoekt_search.go index d3638746807d..81dc64c6ba5a 100644 --- a/cmd/searcher/internal/search/zoekt_search.go +++ b/cmd/searcher/internal/search/zoekt_search.go @@ -22,7 +22,7 @@ import ( "github.com/sourcegraph/sourcegraph/lib/errors" ) -func handleFilePathPatterns(patternInfo *protocol.PatternInfo) (query.Q, error) { +func handleSearchFilters(patternInfo *protocol.PatternInfo) (query.Q, error) { var and []query.Q // Zoekt uses regular expressions for file paths. @@ -42,10 +42,18 @@ func handleFilePathPatterns(patternInfo *protocol.PatternInfo) (query.Q, error) and = append(and, &query.Not{Child: q}) } + for _, lang := range patternInfo.IncludeLangs { + and = append(and, &query.Language{Language: lang}) + } + + for _, lang := range patternInfo.ExcludeLangs { + and = append(and, &query.Not{Child: &query.Language{Language: lang}}) + } + return query.NewAnd(and...), nil } -func buildQuery(pattern string, branchRepos []query.BranchRepos, filePathPatterns query.Q, shortcircuit bool) (query.Q, error) { +func buildQuery(pattern string, branchRepos []query.BranchRepos, filterQuery query.Q, shortcircuit bool) (query.Q, error) { regexString := comby.StructuralPatToRegexpQuery(pattern, shortcircuit) if len(regexString) == 0 { return &query.Const{Value: true}, nil @@ -56,7 +64,7 @@ func buildQuery(pattern string, branchRepos []query.BranchRepos, filePathPattern } return query.NewAnd( &query.BranchesRepos{List: branchRepos}, - filePathPatterns, + filterQuery, &query.Regexp{ Regexp: re, CaseSensitive: true, @@ -87,13 +95,13 @@ func zoektSearch(ctx context.Context, logger log.Logger, client zoekt.Streamer, }).ToSearchOptions(ctx) searchOpts.Whole = true - filePathPatterns, err := handleFilePathPatterns(args) + filterQuery, err := handleSearchFilters(args) if err != nil { return err } t0 := time.Now() - q, err := buildQuery(atom.Value, branchRepos, filePathPatterns, false) + q, err := buildQuery(atom.Value, branchRepos, filterQuery, false) if err != nil { return err } diff --git a/cmd/searcher/internal/search/zoekt_search_test.go b/cmd/searcher/internal/search/zoekt_search_test.go index e201236a1c60..3a379a9fb0f8 100644 --- a/cmd/searcher/internal/search/zoekt_search_test.go +++ b/cmd/searcher/internal/search/zoekt_search_test.go @@ -62,3 +62,56 @@ func Test_zoektSearch(t *testing.T) { ) require.Error(t, err) } + +func TestHandleSearchFilters(t *testing.T) { + tests := []struct { + name string + patternInfo *protocol.PatternInfo + expectedQ query.Q + expectedErr error + }{ + { + name: "Include and exclude paths", + patternInfo: &protocol.PatternInfo{ + IncludePaths: []string{"\\.go", "cmd"}, + ExcludePaths: "vendor/", + IsCaseSensitive: false, + }, + expectedQ: query.NewAnd( + &query.Substring{Pattern: ".go", FileName: true}, + &query.Substring{Pattern: "cmd", FileName: true}, + &query.Not{Child: &query.Substring{Pattern: "vendor/", FileName: true}}, + ), + expectedErr: nil, + }, + { + name: "Include and exclude languages", + patternInfo: &protocol.PatternInfo{ + IncludePaths: []string{"cmd"}, + IncludeLangs: []string{"go", "typescript"}, + ExcludeLangs: []string{"javascript"}, + }, + expectedQ: query.NewAnd( + &query.Substring{Pattern: "cmd", FileName: true}, + &query.Language{Language: "go"}, + &query.Language{Language: "typescript"}, + &query.Not{Child: &query.Language{Language: "javascript"}}, + ), + expectedErr: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + q, err := handleSearchFilters(tt.patternInfo) + + if tt.expectedErr != nil { + require.Error(t, err) + require.Equal(t, tt.expectedErr.Error(), err.Error()) + } else { + require.NoError(t, err) + require.Equal(t, tt.expectedQ, q) + } + }) + } +} diff --git a/internal/search/job/jobutil/job_test.go b/internal/search/job/jobutil/job_test.go index 2cc1714e2234..a3f6a101acf6 100644 --- a/internal/search/job/jobutil/job_test.go +++ b/internal/search/job/jobutil/job_test.go @@ -1515,6 +1515,10 @@ func TestToTextPatternInfo(t *testing.T) { }, { input: `repo:^github\.com/sgtest/sourcegraph-typescript$ file:^README\.md "basic :[_] access :[_]" patterntype:structural`, output: autogold.Expect(`{"Query":{"Value":"\"basic :[_] access :[_]\"","IsNegated":false,"IsRegExp":false},"IsStructuralPat":true,"CombyRule":"","IsCaseSensitive":false,"FileMatchLimit":30,"Index":"yes","Select":[],"IncludePaths":["^README\\.md"],"ExcludePaths":"","IncludeLangs":null,"ExcludeLangs":null,"PathPatternsAreCaseSensitive":false,"PatternMatchesContent":true,"PatternMatchesPath":true,"Languages":null}`), + }, { + input: `repo:^github\.com/sgtest/sourcegraph-typescript$ lang:typescript "basic :[_] access :[_]" patterntype:structural`, + feat: search.Features{ContentBasedLangFilters: true}, + output: autogold.Expect(`{"Query":{"Value":"\"basic :[_] access :[_]\"","IsNegated":false,"IsRegExp":false},"IsStructuralPat":true,"CombyRule":"","IsCaseSensitive":false,"FileMatchLimit":30,"Index":"yes","Select":[],"IncludePaths":null,"ExcludePaths":"","IncludeLangs":["TypeScript"],"ExcludeLangs":null,"PathPatternsAreCaseSensitive":false,"PatternMatchesContent":true,"PatternMatchesPath":true,"Languages":["typescript"]}`), }, { input: `sgtest lang:magik type:file`, feat: search.Features{ContentBasedLangFilters: true},