Skip to content

Commit

Permalink
Add support for Magik and Pkl languages that are not handled by Lingu…
Browse files Browse the repository at this point in the history
…ist (#790)

Add fallbacks for languages not supported yet by linguist or go-enry
  • Loading branch information
mmanela authored Jun 11, 2024
1 parent 376af3a commit c21df41
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 4 deletions.
4 changes: 4 additions & 0 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3449,6 +3449,7 @@ func TestSearchTypeLanguage(t *testing.T) {
Document{Name: "apex.cls", Content: []byte("public class Car extends Vehicle {")},
Document{Name: "tex.cls", Content: []byte(`\DeclareOption*{`)},
Document{Name: "hello.h", Content: []byte(`#include <stdio.h>`)},
Document{Name: "be.magik", Content: []byte(`_package unicorn`)},
)

t.Log(b.languageMap)
Expand Down Expand Up @@ -3486,6 +3487,9 @@ func TestSearchTypeLanguage(t *testing.T) {
res = searchForTest(t, b, &query.Language{Language: "C"})
wantSingleMatch(res, "hello.h")

res = searchForTest(t, b, &query.Language{Language: "Magik"})
wantSingleMatch(res, "be.magik")

// test fallback language search by pretending it's an older index version
res = searchForTest(t, b, &query.Language{Language: "C++"})
if len(res.Files) != 0 {
Expand Down
4 changes: 2 additions & 2 deletions indexbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (
"time"
"unicode/utf8"

"github.com/go-enry/go-enry/v2"
"github.com/sourcegraph/zoekt/internal/languages"
)

var _ = log.Println
Expand Down Expand Up @@ -397,7 +397,7 @@ func (b *IndexBuilder) addSymbols(symbols []*Symbol) {

// DetermineLanguageIfUnknown fills in doc.Language when it is empty, detecting
// the language from the document's name and content. A document whose Language
// is already set is left untouched.
func DetermineLanguageIfUnknown(doc *Document) {
if doc.Language == "" {
// NOTE(review): this is a rendered diff hunk without +/- markers; the
// enry call below appears to be the line removed by the commit and the
// languages call the line that replaces it — confirm against the repo.
doc.Language = enry.GetLanguage(doc.Name, doc.Content)
doc.Language = languages.GetLanguage(doc.Name, doc.Content)
}
}

Expand Down
66 changes: 66 additions & 0 deletions internal/languages/language.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Package languages wraps go-enry (https://github.com/go-enry/go-enry) to support additional languages.
// go-enry is based on a package called Linguist (https://github.com/github/linguist),
// and sometimes a programming language is not supported by Linguist
// or takes a while to be merged there and make it into go-enry. This wrapper
// gives us the flexibility to support languages in those cases. We list additional languages
// in this file and remove them once they make it into Linguist and go-enry.
// This logic is similar to what we have in the sourcegraph/sourcegraph repo; in the future
// we plan to refactor both into a common library to share between the two repos.
package languages

import (
"path/filepath"
"strings"

"github.com/go-enry/go-enry/v2"
)

// Fallback tables for languages that go-enry (and therefore Linguist) does not
// recognize yet. Entries should be dropped once upstream support lands.
var (
	// unsupportedByLinguistAliasMap maps a lowercase language alias to the
	// canonical language name.
	unsupportedByLinguistAliasMap = map[string]string{
		// Magik Language
		"magik": "Magik",
		// Pkl Configuration Language (https://pkl-lang.org/).
		// Added to linguist on 6/7/24; can be removed once the go-enry
		// package updates to that linguist version.
		"pkl": "Pkl",
	}

	// unsupportedByLinguistExtensionToNameMap maps a lowercase file extension
	// (including the leading dot) to the canonical language name.
	unsupportedByLinguistExtensionToNameMap = map[string]string{
		// Magik Language
		".magik": "Magik",
		// Pkl Configuration Language (https://pkl-lang.org/)
		".pkl": "Pkl",
	}
)

// GetLanguageByAlias is a replacement for enry.GetLanguageByAlias that also
// supports languages missing from linguist.
//
// go-enry is consulted first with the alias as given. Only when it reports no
// match is the lowercased alias looked up in the local fallback table. ok
// reports whether either lookup succeeded; language is "" when neither did.
func GetLanguageByAlias(alias string) (language string, ok bool) {
	if name, found := enry.GetLanguageByAlias(alias); found {
		return name, true
	}

	// Fallback keys are stored in lowercase, so normalize before the lookup.
	name, found := unsupportedByLinguistAliasMap[strings.ToLower(alias)]
	return name, found
}

// GetLanguage is a replacement for enry.GetLanguage. It returns the most
// probable language for the given file name and content, with added support
// for languages missing from linguist.
//
// The result from go-enry wins whenever it is non-empty. Otherwise the
// lowercased file extension is looked up in the local fallback table. An empty
// string is returned when the file has no extension or the extension is not
// in the table.
func GetLanguage(filename string, content []byte) (language string) {
	if detected := enry.GetLanguage(filename, content); detected != "" {
		return detected
	}

	// go-enry could not identify the file; fall back on our internal
	// extension table for languages missing in linguist.
	ext := filepath.Ext(filename)
	if ext == "" {
		return ""
	}

	// A missing key yields the zero value "", i.e. "language unknown".
	return unsupportedByLinguistExtensionToNameMap[strings.ToLower(ext)]
}
95 changes: 95 additions & 0 deletions internal/languages/language_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package languages

import "testing"

// TestGetLanguageByAlias checks alias resolution for empty and unknown
// aliases, an alias go-enry already knows, and fallback aliases in both
// lowercase and mixed case.
func TestGetLanguageByAlias(t *testing.T) {
	type aliasCase struct {
		name   string
		alias  string
		want   string
		wantOk bool
	}

	cases := []aliasCase{
		{name: "empty alias", alias: "", want: "", wantOk: false},
		{name: "unknown alias", alias: "unknown", want: "", wantOk: false},
		{name: "supported alias", alias: "go", want: "Go", wantOk: true},
		{name: "unsupported by linguist alias", alias: "magik", want: "Magik", wantOk: true},
		{name: "unsupported by linguist alias normalized", alias: "mAgIk", want: "Magik", wantOk: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			lang, found := GetLanguageByAlias(tc.alias)
			if lang == tc.want && found == tc.wantOk {
				return
			}
			t.Errorf("GetLanguageByAlias(%q) = %q, %t, want %q, %t", tc.alias, lang, found, tc.want, tc.wantOk)
		})
	}
}

// TestGetLanguage checks language detection for an empty file name, an
// unknown extension, an extension go-enry already knows, and an extension
// that is only present in the fallback table.
func TestGetLanguage(t *testing.T) {
	type detectCase struct {
		name     string
		filename string
		content  []byte
		want     string
	}

	cases := []detectCase{
		{name: "empty filename", filename: "", content: []byte(""), want: ""},
		{name: "unknown extension", filename: "file.unknown", content: []byte(""), want: ""},
		{name: "supported extension", filename: "file.go", content: []byte("package main"), want: "Go"},
		{name: "unsupported by linguist extension", filename: "file.magik", content: []byte(""), want: "Magik"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			lang := GetLanguage(tc.filename, tc.content)
			if lang == tc.want {
				return
			}
			t.Errorf("GetLanguage(%q, %q) = %q, want %q", tc.filename, tc.content, lang, tc.want)
		})
	}
}
4 changes: 2 additions & 2 deletions query/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ import (
"log"
"regexp/syntax"

"github.com/go-enry/go-enry/v2"
"github.com/grafana/regexp"
"github.com/sourcegraph/zoekt/internal/languages"
)

var _ = log.Printf
Expand Down Expand Up @@ -172,7 +172,7 @@ func parseExpr(in []byte) (Q, int, error) {
}
expr = q
case tokLang:
canonical, ok := enry.GetLanguageByAlias(text)
canonical, ok := languages.GetLanguageByAlias(text)
if !ok {
expr = &Const{false}
} else {
Expand Down

0 comments on commit c21df41

Please sign in to comment.