diff --git a/index_test.go b/index_test.go index 05401215f..5aeec11ff 100644 --- a/index_test.go +++ b/index_test.go @@ -3449,6 +3449,7 @@ func TestSearchTypeLanguage(t *testing.T) { Document{Name: "apex.cls", Content: []byte("public class Car extends Vehicle {")}, Document{Name: "tex.cls", Content: []byte(`\DeclareOption*{`)}, Document{Name: "hello.h", Content: []byte(`#include `)}, + Document{Name: "be.magik", Content: []byte(`_package unicorn`)}, ) t.Log(b.languageMap) @@ -3486,6 +3487,9 @@ func TestSearchTypeLanguage(t *testing.T) { res = searchForTest(t, b, &query.Language{Language: "C"}) wantSingleMatch(res, "hello.h") + res = searchForTest(t, b, &query.Language{Language: "Magik"}) + wantSingleMatch(res, "be.magik") + // test fallback language search by pretending it's an older index version res = searchForTest(t, b, &query.Language{Language: "C++"}) if len(res.Files) != 0 { diff --git a/indexbuilder.go b/indexbuilder.go index 6a7b5ea1a..026fd7e8d 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -27,7 +27,7 @@ import ( "time" "unicode/utf8" - "github.com/go-enry/go-enry/v2" + "github.com/sourcegraph/zoekt/internal/languages" ) var _ = log.Println @@ -397,7 +397,7 @@ func (b *IndexBuilder) addSymbols(symbols []*Symbol) { func DetermineLanguageIfUnknown(doc *Document) { if doc.Language == "" { - doc.Language = enry.GetLanguage(doc.Name, doc.Content) + doc.Language = languages.GetLanguage(doc.Name, doc.Content) } } diff --git a/internal/languages/language.go b/internal/languages/language.go new file mode 100644 index 000000000..7f1110757 --- /dev/null +++ b/internal/languages/language.go @@ -0,0 +1,66 @@ +// Package languages wraps the logic of go-enry (https://github.com/go-enry/go-enry) to support additional languages. +// go-enry is based on a package called Linguist (https://github.com/github/linguist) +// and sometimes programming languages may not be supported by Linguist +// or may take a while to get merged in and make it into go-enry. 
This wrapper +// gives us flexibility to support languages in those cases. We list additional languages +// in this file and remove them once they make it into Linguist and go-enry. +// This logic is similar to what we have in the sourcegraph/sourcegraph repo; in the future +// we plan to refactor both into a common library to share between the two repos. +package languages + +import ( + "path/filepath" + "strings" + + "github.com/go-enry/go-enry/v2" +) + +var unsupportedByLinguistAliasMap = map[string]string{ + // Pkl Configuration Language (https://pkl-lang.org/) + // Added to linguist on 6/7/24; + // this entry can be removed once the go-enry package updates + // to that linguist version. + "pkl": "Pkl", + // Magik Language + "magik": "Magik", +} + +var unsupportedByLinguistExtensionToNameMap = map[string]string{ + // Pkl Configuration Language (https://pkl-lang.org/) + ".pkl": "Pkl", + // Magik Language + ".magik": "Magik", +} + +// GetLanguageByAlias is a replacement for enry.GetLanguageByAlias. +// If enry does not know the alias, it looks up the lowercased alias among the languages that are missing in linguist. +func GetLanguageByAlias(alias string) (language string, ok bool) { + language, ok = enry.GetLanguageByAlias(alias) + if !ok { + normalizedAlias := strings.ToLower(alias) + language, ok = unsupportedByLinguistAliasMap[normalizedAlias] + } + + return +} + +// GetLanguage is a replacement for enry.GetLanguage: it returns enry's result when that is +// non-empty, and otherwise looks up the lowercased file extension among the languages +// missing from linguist. It returns "" if the file has no extension or nothing matches. +func GetLanguage(filename string, content []byte) (language string) { + language = enry.GetLanguage(filename, content) + + // If go-enry failed to find a language, fall back on our + // internal check for languages missing in linguist. + if language == "" { + ext := filepath.Ext(filename) + normalizedExt := strings.ToLower(ext) + if ext == "" { + return + } + if lang, ok := unsupportedByLinguistExtensionToNameMap[normalizedExt]; ok { + language = lang + } + } + return +} diff --git a/internal/languages/language_test.go 
b/internal/languages/language_test.go new file mode 100644 index 000000000..294c67776 --- /dev/null +++ b/internal/languages/language_test.go @@ -0,0 +1,109 @@ +package languages + +import "testing" + +// TestGetLanguageByAlias checks alias lookup for aliases go-enry resolves and for the extra aliases this package adds. +func TestGetLanguageByAlias(t *testing.T) { + tests := []struct { + name string + alias string + want string + wantOk bool + }{ + { + name: "empty alias", + alias: "", + want: "", + wantOk: false, + }, + { + name: "unknown alias", + alias: "unknown", + want: "", + wantOk: false, + }, + { + name: "supported alias", + alias: "go", + want: "Go", + wantOk: true, + }, + { + name: "unsupported by linguist alias", + alias: "magik", + want: "Magik", + wantOk: true, + }, + { + name: "unsupported by linguist alias normalized", + alias: "mAgIk", + want: "Magik", + wantOk: true, + }, + { + name: "pkl alias", + alias: "pkl", + want: "Pkl", + wantOk: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, ok := GetLanguageByAlias(tt.alias) + if got != tt.want || ok != tt.wantOk { + t.Errorf("GetLanguageByAlias(%q) = %q, %t, want %q, %t", tt.alias, got, ok, tt.want, tt.wantOk) + } + }) + } +} + +// TestGetLanguage checks detection by filename for files go-enry classifies and for the extra extensions this package adds. +func TestGetLanguage(t *testing.T) { + tests := []struct { + name string + filename string + content []byte + want string + }{ + { + name: "empty filename", + filename: "", + content: []byte(""), + want: "", + }, + { + name: "unknown extension", + filename: "file.unknown", + content: []byte(""), + want: "", + }, + { + name: "supported extension", + filename: "file.go", + content: []byte("package main"), + want: "Go", + }, + { + name: "unsupported by linguist extension", + filename: "file.magik", + content: []byte(""), + want: "Magik", + }, + { + name: "unsupported by linguist extension normalized", + filename: "FILE.MAGIK", + content: []byte(""), + want: "Magik", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := GetLanguage(tt.filename, tt.content) + if got != tt.want { + t.Errorf("GetLanguage(%q, %q) = %q, want %q", tt.filename, tt.content, got, tt.want) + } + }) + } +} diff --git a/query/parse.go b/query/parse.go index 028017692..d8762f191 100644 --- a/query/parse.go +++ b/query/parse.go @@ -20,8 +20,8 @@ import ( "log" 
"regexp/syntax" - "github.com/go-enry/go-enry/v2" "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt/internal/languages" ) var _ = log.Printf @@ -172,7 +172,7 @@ func parseExpr(in []byte) (Q, int, error) { } expr = q case tokLang: - canonical, ok := enry.GetLanguageByAlias(text) + canonical, ok := languages.GetLanguageByAlias(text) if !ok { expr = &Const{false} } else {