From 58ceb0565b06b157ee6292baef41ba9f77efd774 Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Fri, 7 Jun 2024 10:21:40 -0400 Subject: [PATCH 1/7] Add support for Magik and Pkl languages that are not handled by Linguist --- .vscode/launch.json | 46 ++++++++++++++++++++++++++++++++++++++------- indexbuilder.go | 4 ++-- query/parse.go | 4 ++-- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index dffa3c19e..21d9b370f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,11 +4,43 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ - { - "name": "Attach to Process (from list)", - "type": "go", - "request": "attach", - "mode": "local" - }, + { + "name": "Index folder", + "type": "go", + "request": "launch", + "mode": "auto", + "program": "cmd/zoekt-git-index", + "cwd": "${workspaceFolder}", + "args": ["${input:path}"] + }, + { + "name": "Webserver", + "type": "go", + "request": "launch", + "mode": "auto", + "program": "cmd/zoekt-webserver", + "cwd": "${workspaceFolder}", + "args": ["-index", "${input:indexPath}"] + }, + { + "name": "Attach to Process (from list)", + "type": "go", + "request": "attach", + "mode": "local" + } + ], + "inputs": [ + { + "id": "path", + "description": "Please enter the path to the project to index", + "default": "", + "type": "promptString" + }, + { + "id": "indexPath", + "description": "Enter the path where indexes are stored", + "default": "${userHome}/.zoekt", + "type": "promptString" + } ] -} \ No newline at end of file + } \ No newline at end of file diff --git a/indexbuilder.go b/indexbuilder.go index 6a7b5ea1a..026fd7e8d 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -27,7 +27,7 @@ import ( "time" "unicode/utf8" - "github.com/go-enry/go-enry/v2" + "github.com/sourcegraph/zoekt/internal/languages" ) var _ = log.Println @@ -397,7 +397,7 @@ func (b *IndexBuilder) addSymbols(symbols []*Symbol) 
{ func DetermineLanguageIfUnknown(doc *Document) { if doc.Language == "" { - doc.Language = enry.GetLanguage(doc.Name, doc.Content) + doc.Language = languages.GetLanguage(doc.Name, doc.Content) } } diff --git a/query/parse.go b/query/parse.go index 028017692..d8762f191 100644 --- a/query/parse.go +++ b/query/parse.go @@ -20,8 +20,8 @@ import ( "log" "regexp/syntax" - "github.com/go-enry/go-enry/v2" "github.com/grafana/regexp" + "github.com/sourcegraph/zoekt/internal/languages" ) var _ = log.Printf @@ -172,7 +172,7 @@ func parseExpr(in []byte) (Q, int, error) { } expr = q case tokLang: - canonical, ok := enry.GetLanguageByAlias(text) + canonical, ok := languages.GetLanguageByAlias(text) if !ok { expr = &Const{false} } else { From c3add46a43c9f3c6740cccd450c3603e809207f0 Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Fri, 7 Jun 2024 10:22:05 -0400 Subject: [PATCH 2/7] Try go-enry before falling back --- internal/languages/language.go | 55 +++++++++++++++++ internal/languages/language_test.go | 95 +++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 internal/languages/language.go create mode 100644 internal/languages/language_test.go diff --git a/internal/languages/language.go b/internal/languages/language.go new file mode 100644 index 000000000..d478c6e33 --- /dev/null +++ b/internal/languages/language.go @@ -0,0 +1,55 @@ +package languages + +import ( + "path/filepath" + "strings" + + "github.com/go-enry/go-enry/v2" +) + +var unsupportedByLinguistAliasMap = map[string]string{ + // Pkl Configuration Language (https://pkl-lang.org/) + "pkl": "Pkl", + // Magik Language + "magik": "Magik", +} + +var unsupportedByLinguistExtensionToNameMap = map[string]string{ + // Pkl Configuration Language (https://pkl-lang.org/) + ".pkl": "Pkl", + // Magik Language + ".magik": "Magik", +} + +// GetLanguageByAlias is a replacement for enry.GetLanguageByAlias. +// It supports languages that are missing in Linguist. +func GetLanguageByAlias(alias 
string) (language string, ok bool) { + language, ok = enry.GetLanguageByAlias(alias) + if !ok { + normalizedAlias := strings.ToLower(alias) + language, ok = unsupportedByLinguistAliasMap[normalizedAlias] + } + + return +} + +// GetLanguage is a replacement for enry.GetLanguage +// to find out the most probable language to return but includes support +// for languages missing from linguist +func GetLanguage(filename string, content []byte) (language string) { + language = enry.GetLanguage(filename, content) + + // If go-enry failed to find language, fall back on our + // internal check for languages missing in linguist + if language == "" { + ext := filepath.Ext(filename) + normalizedExt := strings.ToLower(ext) + if ext == "" { + return + } + if lang, ok := unsupportedByLinguistExtensionToNameMap[normalizedExt]; ok { + language = lang + } + } + return +} diff --git a/internal/languages/language_test.go b/internal/languages/language_test.go new file mode 100644 index 000000000..294c67776 --- /dev/null +++ b/internal/languages/language_test.go @@ -0,0 +1,95 @@ +package languages + +import "testing" + +func TestGetLanguageByAlias(t *testing.T) { + tests := []struct { + name string + alias string + want string + wantOk bool + }{ + { + name: "empty alias", + alias: "", + want: "", + wantOk: false, + }, + { + name: "unknown alias", + alias: "unknown", + want: "", + wantOk: false, + }, + { + name: "supported alias", + alias: "go", + want: "Go", + wantOk: true, + }, + { + name: "unsupported by linguist alias", + alias: "magik", + want: "Magik", + wantOk: true, + }, + { + name: "unsupported by linguist alias normalized", + alias: "mAgIk", + want: "Magik", + wantOk: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, ok := GetLanguageByAlias(tt.alias) + if got != tt.want || ok != tt.wantOk { + t.Errorf("GetLanguageByAlias(%q) = %q, %t, want %q, %t", tt.alias, got, ok, tt.want, tt.wantOk) + } + }) + } +} + +func TestGetLanguage(t 
*testing.T) { + tests := []struct { + name string + filename string + content []byte + want string + }{ + { + name: "empty filename", + filename: "", + content: []byte(""), + want: "", + }, + { + name: "unknown extension", + filename: "file.unknown", + content: []byte(""), + want: "", + }, + { + name: "supported extension", + filename: "file.go", + content: []byte("package main"), + want: "Go", + }, + { + name: "unsupported by linguist extension", + filename: "file.magik", + content: []byte(""), + want: "Magik", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := GetLanguage(tt.filename, tt.content) + if got != tt.want { + t.Errorf("GetLanguage(%q, %q) = %q, want %q", tt.filename, tt.content, got, tt.want) + } + }) + } +} From a524bb34141320f81b47e39dd284f1529994239a Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Fri, 7 Jun 2024 10:23:46 -0400 Subject: [PATCH 3/7] Revert file --- .vscode/launch.json | 46 +++++++-------------------------------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 21d9b370f..dffa3c19e 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,43 +4,11 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ - { - "name": "Index folder", - "type": "go", - "request": "launch", - "mode": "auto", - "program": "cmd/zoekt-git-index", - "cwd": "${workspaceFolder}", - "args": ["${input:path}"] - }, - { - "name": "Webserver", - "type": "go", - "request": "launch", - "mode": "auto", - "program": "cmd/zoekt-webserver", - "cwd": "${workspaceFolder}", - "args": ["-index", "${input:indexPath}"] - }, - { - "name": "Attach to Process (from list)", - "type": "go", - "request": "attach", - "mode": "local" - } - ], - "inputs": [ - { - "id": "path", - "description": "Please enter the path to the project to index", - "default": "", - "type": "promptString" - }, - { - "id": 
"indexPath", - "description": "Enter the path where indexes are stored", - "default": "${userHome}/.zoekt", - "type": "promptString" - } + { + "name": "Attach to Process (from list)", + "type": "go", + "request": "attach", + "mode": "local" + }, ] - } \ No newline at end of file +} \ No newline at end of file From 8b5164c940bdbcaa3f52a95f8d2581a587611484 Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Fri, 7 Jun 2024 15:37:00 -0400 Subject: [PATCH 4/7] Add comment on pkl now being in linguist --- internal/languages/language.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/languages/language.go b/internal/languages/language.go index d478c6e33..c59bb56c8 100644 --- a/internal/languages/language.go +++ b/internal/languages/language.go @@ -9,6 +9,9 @@ import ( var unsupportedByLinguistAliasMap = map[string]string{ // Pkl Configuration Language (https://pkl-lang.org/) + // Added to Linguist on 6/7/24; + // can be removed once the go-enry package updates + // to that Linguist version "pkl": "Pkl", // Magik Language "magik": "Magik", From 6ee230c1607fd578758ecca801254df6d30d9489 Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Mon, 10 Jun 2024 14:42:53 -0400 Subject: [PATCH 5/7] Add comment --- internal/languages/language.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/languages/language.go b/internal/languages/language.go index c59bb56c8..43ceed264 100644 --- a/internal/languages/language.go +++ b/internal/languages/language.go @@ -1,3 +1,9 @@ +// This file wraps the logic of go-enry (https://github.com/go-enry/go-enry) to support additional languages. +// go-enry is based off of a package called Linguist (https://github.com/github/linguist) +// and sometimes programming languages may not be supported by Linguist +// or may take a while to get merged in and make it into go-enry. This wrapper +// gives us flexibility to support languages in those cases. 
We list additional languages +// in this file and remove them once they make it into Linguist and go-enry package languages import ( From 60d0ee53e5692d7ac9d2d8d8ea2a1def4abfb42a Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Mon, 10 Jun 2024 14:55:21 -0400 Subject: [PATCH 6/7] Update comment --- internal/languages/language.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/languages/language.go b/internal/languages/language.go index 43ceed264..7f1110757 100644 --- a/internal/languages/language.go +++ b/internal/languages/language.go @@ -3,7 +3,9 @@ // and sometimes programming languages may not be supported by Linguist // or may take a while to get merged in and make it into go-enry. This wrapper // gives us flexibility to support languages in those cases. We list additional languages -// in this file and remove them once they make it into Linguist and go-enry +// in this file and remove them once they make it into Linguist and go-enry. +// This logic is similar to what we have in the sourcegraph/sourcegraph repo, in the future +// we plan to refactor both into a common library to share between the two repos. 
package languages import ( From 615f7a192170bbcfa693141b24c57d2fd71e244f Mon Sep 17 00:00:00 2001 From: Matthew Manela Date: Tue, 11 Jun 2024 12:56:53 -0400 Subject: [PATCH 7/7] new test for resolution code path --- index_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index_test.go b/index_test.go index 05401215f..5aeec11ff 100644 --- a/index_test.go +++ b/index_test.go @@ -3449,6 +3449,7 @@ func TestSearchTypeLanguage(t *testing.T) { Document{Name: "apex.cls", Content: []byte("public class Car extends Vehicle {")}, Document{Name: "tex.cls", Content: []byte(`\DeclareOption*{`)}, Document{Name: "hello.h", Content: []byte(`#include `)}, + Document{Name: "be.magik", Content: []byte(`_package unicorn`)}, ) t.Log(b.languageMap) @@ -3486,6 +3487,9 @@ func TestSearchTypeLanguage(t *testing.T) { res = searchForTest(t, b, &query.Language{Language: "C"}) wantSingleMatch(res, "hello.h") + res = searchForTest(t, b, &query.Language{Language: "Magik"}) + wantSingleMatch(res, "be.magik") + // test fallback language search by pretending it's an older index version res = searchForTest(t, b, &query.Language{Language: "C++"}) if len(res.Files) != 0 {