Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Symbol search: support content-based lang detection #60626

Merged
merged 6 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions cmd/symbols/internal/api/handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,16 @@ func TestHandler(t *testing.T) {
pathToEntries := map[string][]*ctags.Entry{
"a.js": {
{
Name: "x",
Path: "a.js",
Line: 1, // ctags line numbers are 1-based
Name: "x",
Path: "a.js",
Language: "JavaScript",
Line: 1, // ctags line numbers are 1-based
},
{
Name: "y",
Path: "a.js",
Line: 2,
Name: "y",
Path: "a.js",
Language: "JavaScript",
Line: 2,
},
},
}
Expand Down Expand Up @@ -83,8 +85,8 @@ func TestHandler(t *testing.T) {
GRPCConnectionCache: connectionCache,
}

x := result.Symbol{Name: "x", Path: "a.js", Line: 0, Character: 4}
y := result.Symbol{Name: "y", Path: "a.js", Line: 1, Character: 4}
x := result.Symbol{Name: "x", Path: "a.js", Language: "JavaScript", Line: 0, Character: 4}
y := result.Symbol{Name: "y", Path: "a.js", Language: "JavaScript", Line: 1, Character: 4}

testCases := map[string]struct {
args search.SymbolsParameters
Expand Down Expand Up @@ -130,6 +132,14 @@ func TestHandler(t *testing.T) {
args: search.SymbolsParameters{ExcludePattern: "a.js", IsCaseSensitive: true, First: 10},
expected: nil,
},
"include lang filters": {
args: search.SymbolsParameters{Query: "x", IncludeLangs: []string{"Javascript"}, IsCaseSensitive: true, First: 10},
expected: []result.Symbol{x},
},
"exclude lang filters": {
args: search.SymbolsParameters{Query: "y", ExcludeLangs: []string{"Javascript"}, IsCaseSensitive: true, First: 10},
expected: nil,
},
}

for label, testCase := range testCases {
Expand Down
2 changes: 2 additions & 0 deletions cmd/symbols/internal/api/search_sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ func MakeSqliteSearchFunc(observationCtx *observation.Context, cachedDatabaseWri
attribute.Int("numIncludePatterns", len(args.IncludePatterns)),
attribute.String("includePatterns", strings.Join(args.IncludePatterns, ":")),
attribute.String("excludePattern", args.ExcludePattern),
attribute.String("includeLangs", strings.Join(args.IncludeLangs, ":")),
jtibshirani marked this conversation as resolved.
Show resolved Hide resolved
attribute.String("excludeLangs", strings.Join(args.ExcludeLangs, ":")),
attribute.Int("first", args.First),
attribute.Float64("timeoutSeconds", args.Timeout.Seconds()),
}})
Expand Down
11 changes: 11 additions & 0 deletions cmd/symbols/internal/database/store/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ func makeSearchConditions(args search.SymbolsParameters) []*sqlf.Query {
conditions = append(conditions, makeSearchCondition("path", includePattern, args.IsCaseSensitive))
}

for _, includeLang := range args.IncludeLangs {
conditions = append(conditions, makeLangCondition(includeLang))
}
for _, excludeLang := range args.ExcludeLangs {
conditions = append(conditions, negate(makeLangCondition(excludeLang)))
}

filtered := conditions[:0]
for _, condition := range conditions {
if condition != nil {
Expand Down Expand Up @@ -120,6 +127,10 @@ func makeSearchCondition(column string, regex string, isCaseSensitive bool) *sql
return sqlf.Sprintf(column+" REGEXP %s", regex)
}

func makeLangCondition(lang string) *sqlf.Query {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One thing to be aware of: the languages stored here are produced by ctags, whereas the lang filters are expected to be normalized languages from go-enry. I looked through universal-ctags --list-languages and they seem to match up well, so I don't expect this to be a problem in practice.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strong suggestion: Please let's use different types here: EnryLanguage and CtagsLanguage. Mismatches between the two naming schemes come up in edge cases. For example:

  • CUDA is all upper-case in ctags but Cuda in enry.
  • COBOL is uppercase in enry but Cobol in ctags.
  • MatLab in ctags is MATLAB in enry.
  • Protobuf in ctags is Protocol Buffers in enry
  • ctags has both Perl6 and Raku (which are the same language IIUC) whereas enry only has Perl 6
  • ObjectiveC in ctags is Objective-C in enry
  • Asm in ctags is Assembly in enry (there's also Unix Assembly)
  • ctags has Zsh and Sh whereas enry has Shell.

Some of these are fixable with the case conversion below, but others (e.g. Protobuf vs. Protocol Buffers, Asm vs. Assembly, Zsh/Sh vs. Shell) are not.

I'm not expecting this PR to have a perfect conversion function. But it seems conceptually wrong to conflate these two.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, there are indeed more exceptions than I realized. To check I understand your suggestion -- we should have an explicit conversion from go-enry language to ctags language, and update naming to make this distinction clear.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To check I understand your suggestion -- we should have an explicit conversion from go-enry language to ctags language, and update naming to make this distinction clear.

Yes, and specifically for naming, I'm emphasizing that we should have different types (not just variable names) for these two, and a conversion function from one type to another.

We have something similar in the syntax highlighter, albeit more complicated, to translate Sublime grammar names to Tree-sitter language names.

https://sourcegraph.com/github.com/sourcegraph/sourcegraph@8379154d11d98bae47f5a7bb17336485e8b93ecb/-/blob/docker-images/syntax-highlighter/crates/syntax-analysis/src/highlighting.rs?L163-201

Right now, the langInclude and langExclude in various places are just strings instead of being domain-specific types, and that's understandable, but changing that to be a more specific type would be valuable. (However, I'm not suggesting you do that in this PR specifically.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great, that matches my understanding -- I will punt on any new types for this PR.

return sqlf.Sprintf("lower(language) = %s", strings.ToLower(lang))
}

// isLiteralEquality returns true if the given regex matches literal strings exactly.
// If so, this function returns true along with the literal search query. If not, this
// function returns false.
Expand Down
1 change: 1 addition & 0 deletions internal/rockskip/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ go_library(
"//internal/database/dbutil",
"//internal/gitserver/gitdomain",
"//internal/search",
"//internal/search/query",
"//internal/search/result",
"//lib/errors",
"@com_github_amit7itz_goset//:goset",
Expand Down
11 changes: 11 additions & 0 deletions internal/rockskip/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/database"
"github.com/sourcegraph/sourcegraph/internal/search"
"github.com/sourcegraph/sourcegraph/internal/search/query"
"github.com/sourcegraph/sourcegraph/internal/search/result"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
Expand Down Expand Up @@ -437,6 +438,16 @@ func convertSearchArgsToSqlQuery(args search.SymbolsParameters) *sqlf.Query {
// ExcludePaths
conjunctOrNils = append(conjunctOrNils, negate(regexMatch(pathConditions, args.ExcludePattern, args.IsCaseSensitive)))

// Rockskip doesn't store the file's language, so we convert the language filters into path filters as a
// best-effort approximation. We ignore the search's case-sensitivity, since it doesn't apply to these filters.
varungandhi-src marked this conversation as resolved.
Show resolved Hide resolved
for _, includeLang := range args.IncludeLangs {
conjunctOrNils = append(conjunctOrNils, regexMatch(pathConditions, query.LangToFileRegexp(includeLang), false))
}

for _, excludeLang := range args.ExcludeLangs {
conjunctOrNils = append(conjunctOrNils, negate(regexMatch(pathConditions, query.LangToFileRegexp(excludeLang), false)))
}

// Drop nils
conjuncts := []*sqlf.Query{}
for _, condition := range conjunctOrNils {
Expand Down
15 changes: 13 additions & 2 deletions internal/rockskip/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/sourcegraph/go-ctags"

"github.com/sourcegraph/sourcegraph/cmd/symbols/fetcher"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/database/dbtest"
Expand Down Expand Up @@ -118,7 +119,11 @@ func TestIndex(t *testing.T) {
verifyBlobs := func() {
repo := "somerepo"
commit := getHead()
args := search.SymbolsParameters{Repo: api.RepoName(repo), CommitID: api.CommitID(commit), Query: ""}
args := search.SymbolsParameters{
Repo: api.RepoName(repo),
CommitID: api.CommitID(commit),
Query: "",
IncludeLangs: []string{"Text"}}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought this patch was additive, so I don't quite understand why this part of the test needs an update, since the "Text" string doesn't appear elsewhere in this patch. Will this test continue to pass even if []string{"Text"} is omitted, or did you add this to make sure we're exercising the 'has language filter' code path?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, I did this too quickly! I just pushed a correction. Here I added a lang filter on "text" to ensure the new codepath in internal/rockskip/search.go is exercised. If we remove that logic, this test will fail.

Overall, the rockskip tests are really lacking. If we make further changes, we will invest in new unit testing suites.

symbols, err := service.Search(context.Background(), args)
fatalIfError(err, "Search")

Expand All @@ -133,7 +138,10 @@ func TestIndex(t *testing.T) {
}
wantPaths := []string{}
for wantPath := range state {
wantPaths = append(wantPaths, wantPath)
// We only want .txt files since we're filtering by lang: text
if strings.Contains(wantPath, ".txt") {
wantPaths = append(wantPaths, wantPath)
}
}
sort.Strings(gotPaths)
sort.Strings(wantPaths)
Expand Down Expand Up @@ -179,6 +187,9 @@ func TestIndex(t *testing.T) {
add("c.txt", "sym1\nsym2")
commit("add another file with 2 symbols")

add("a.java", "sym1\nsym2")
commit("System.out.println(\"hello, world!\"")

add("a.txt", "sym1\nsym2")
commit("add a symbol to a.txt")

Expand Down
22 changes: 17 additions & 5 deletions internal/search/job/jobutil/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ func NewFlatJob(searchInputs *search.Inputs, f query.Flat) (job.Job, error) {
if resultTypes.Has(result.TypeSymbol) {
// Create Symbol Search jobs over repo set.
if !skipRepoSubsetSearch {
request, err := toSymbolSearchRequest(f)
request, err := toSymbolSearchRequest(f, searchInputs.Features)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -629,7 +629,7 @@ func mapSlice(values []string, f func(string) string) []string {
return res
}

func toSymbolSearchRequest(f query.Flat) (*searcher.SymbolSearchRequest, error) {
func toSymbolSearchRequest(f query.Flat, feat *search.Features) (*searcher.SymbolSearchRequest, error) {
if f.Pattern != nil && f.Pattern.Negated {
return nil, &query.UnsupportedError{
Msg: "symbol search does not support negation.",
Expand All @@ -640,17 +640,29 @@ func toSymbolSearchRequest(f query.Flat) (*searcher.SymbolSearchRequest, error)
// assumes that a literal pattern is an escaped regular expression.
regexpPattern := f.ToBasic().PatternString()

// Handle file: and -file: filters.
filesInclude, filesExclude := f.IncludeExcludeValues(query.FieldFile)
langInclude, langExclude := f.IncludeExcludeValues(query.FieldLang)

filesInclude = append(filesInclude, mapSlice(langInclude, query.LangToFileRegexp)...)
filesExclude = append(filesExclude, mapSlice(langExclude, query.LangToFileRegexp)...)
// Handle lang: and -lang: filters.
langAliasInclude, langAliasExclude := f.IncludeExcludeValues(query.FieldLang)
var langInclude, langExclude []string
if feat.ContentBasedLangFilters {
langInclude = toLangFilters(langAliasInclude)
langExclude = toLangFilters(langAliasExclude)
} else {
// If the 'search-content-based-lang-detection' feature is disabled, then we convert the filters
// to file path regexes and do not pass any explicit language filters to the backend.
filesInclude = append(filesInclude, mapSlice(langAliasInclude, query.LangToFileRegexp)...)
filesExclude = append(filesExclude, mapSlice(langAliasExclude, query.LangToFileRegexp)...)
}

return &searcher.SymbolSearchRequest{
RegexpPattern: regexpPattern,
IsCaseSensitive: f.IsCaseSensitive(),
IncludePatterns: filesInclude,
ExcludePattern: query.UnionRegExps(filesExclude),
IncludeLangs: langInclude,
ExcludeLangs: langExclude,
}, nil
}

Expand Down
43 changes: 24 additions & 19 deletions internal/search/job/jobutil/job_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1059,41 +1059,46 @@ func TestToSymbolSearchRequest(t *testing.T) {
cases := []struct {
input string
output autogold.Value
feat search.Features
wantErr bool
}{{
input: `repo:go-diff patterntype:literal HunkNoChunksize select:symbol file:^README\.md `,
output: autogold.Expect(`{"RegexpPattern":"HunkNoChunksize","IsCaseSensitive":false,"IncludePatterns":["^README\\.md"],"ExcludePattern":""}`),
output: autogold.Expect(`{"RegexpPattern":"HunkNoChunksize","IsCaseSensitive":false,"IncludePatterns":["^README\\.md"],"ExcludePattern":"","IncludeLangs":null,"ExcludeLangs":null}`),
}, {
input: `repo:go-diff patterntype:literal type:symbol HunkNoChunksize select:symbol -file:^README\.md `,
output: autogold.Expect(`{"RegexpPattern":"HunkNoChunksize","IsCaseSensitive":false,"IncludePatterns":null,"ExcludePattern":"^README\\.md"}`),
output: autogold.Expect(`{"RegexpPattern":"HunkNoChunksize","IsCaseSensitive":false,"IncludePatterns":null,"ExcludePattern":"^README\\.md","IncludeLangs":null,"ExcludeLangs":null}`),
}, {
input: `repo:go-diff type:symbol`,
output: autogold.Expect(`{"RegexpPattern":"","IsCaseSensitive":false,"IncludePatterns":null,"ExcludePattern":""}`),
output: autogold.Expect(`{"RegexpPattern":"","IsCaseSensitive":false,"IncludePatterns":null,"ExcludePattern":"","IncludeLangs":null,"ExcludeLangs":null}`),
}, {
input: `type:symbol NOT option`,
output: autogold.Expect("null"),
wantErr: true,
}, {
input: `repo:go-diff type:symbol HunkNoChunksize lang:Julia -lang:R`,
output: autogold.Expect(`{"RegexpPattern":"HunkNoChunksize","IsCaseSensitive":false,"IncludePatterns":["\\.jl$"],"ExcludePattern":"(?:\\.r$)|(?:\\.rd$)|(?:\\.rsx$)|(?:(^|/)\\.Rprofile$)|(?:(^|/)expr-dist$)","IncludeLangs":null,"ExcludeLangs":null}`),
}, {
input: `repo:go-diff type:symbol HunkNoChunksize lang:Julia -lang:R`,
feat: search.Features{ContentBasedLangFilters: true},
output: autogold.Expect(`{"RegexpPattern":"HunkNoChunksize","IsCaseSensitive":false,"IncludePatterns":null,"ExcludePattern":"","IncludeLangs":["Julia"],"ExcludeLangs":["R"]}`),
}}

createRequest := func(input string) (*searcher.SymbolSearchRequest, error) {
plan, err := query.Pipeline(query.Init(input, query.SearchTypeLiteral))
if err != nil {
t.Fatal(err)
}
for _, tc := range cases {
t.Run(tc.input, func(t *testing.T) {
plan, err := query.Pipeline(query.Init(tc.input, query.SearchTypeLiteral))
if err != nil {
t.Fatal(err)
}

b := plan[0]
var pattern *query.Pattern
if p, ok := b.Pattern.(query.Pattern); ok {
pattern = &p
}
b := plan[0]
var pattern *query.Pattern
if p, ok := b.Pattern.(query.Pattern); ok {
pattern = &p
}

f := query.Flat{Parameters: b.Parameters, Pattern: pattern}
return toSymbolSearchRequest(f)
}
f := query.Flat{Parameters: b.Parameters, Pattern: pattern}
r, err := toSymbolSearchRequest(f, &tc.feat)

for _, tc := range cases {
t.Run(tc.input, func(t *testing.T) {
r, err := createRequest(tc.input)
if (err != nil) != tc.wantErr {
t.Fatalf("mismatch error = %v, wantErr %v", err, tc.wantErr)
}
Expand Down
10 changes: 10 additions & 0 deletions internal/search/searcher/symbol_search_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ func searchInRepo(ctx context.Context, gitserverClient gitserver.Client, repoRev
IsRegExp: true,
IncludePatterns: request.IncludePatterns,
ExcludePattern: request.ExcludePattern,
IncludeLangs: request.IncludeLangs,
ExcludeLangs: request.ExcludeLangs,
// Ask for limit + 1 so we can detect whether there are more results than the limit.
First: limit + 1,
})
Expand Down Expand Up @@ -175,6 +177,8 @@ type SymbolSearchRequest struct {
IsCaseSensitive bool
IncludePatterns []string
ExcludePattern string
IncludeLangs []string
ExcludeLangs []string
}

func (r *SymbolSearchRequest) Fields() []attribute.KeyValue {
Expand All @@ -194,5 +198,11 @@ func (r *SymbolSearchRequest) Fields() []attribute.KeyValue {
if r.ExcludePattern != "" {
add(attribute.String("excludePattern", r.ExcludePattern))
}
if len(r.IncludeLangs) > 0 {
add(attribute.StringSlice("includeLangs", r.IncludeLangs))
}
if len(r.ExcludeLangs) > 0 {
add(attribute.StringSlice("excludeLangs", r.ExcludeLangs))
}
return res
}
4 changes: 4 additions & 0 deletions internal/search/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ type SymbolsParameters struct {
// need to match to get excluded from the result
ExcludePattern string

// IncludeLangs and ExcludeLangs hold the language filters to apply.
IncludeLangs []string
ExcludeLangs []string

// First indicates that only the first n symbols should be returned.
First int

Expand Down
4 changes: 4 additions & 0 deletions internal/symbols/v1/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ func (x *SearchRequest) FromInternal(p *search.SymbolsParameters) {
IsCaseSensitive: p.IsCaseSensitive,
IncludePatterns: p.IncludePatterns,
ExcludePattern: p.ExcludePattern,
IncludeLangs: p.IncludeLangs,
ExcludeLangs: p.ExcludeLangs,

First: int32(p.First),
Timeout: durationpb.New(p.Timeout),
Expand All @@ -34,6 +36,8 @@ func (x *SearchRequest) ToInternal() search.SymbolsParameters {
IsCaseSensitive: x.GetIsCaseSensitive(),
IncludePatterns: x.GetIncludePatterns(),
ExcludePattern: x.GetExcludePattern(),
IncludeLangs: x.GetIncludeLangs(),
ExcludeLangs: x.GetExcludeLangs(),
First: int(x.GetFirst()),
Timeout: x.GetTimeout().AsDuration(),
}
Expand Down
Loading
Loading