From 3d052cbd7029f19b9343b6023b0f12463e45d4ab Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Wed, 25 Oct 2023 11:36:32 -0700 Subject: [PATCH 1/2] Ranking: standardize ctags kind names before scoring --- build/e2e_test.go | 64 +++++++++++++++ build/testdata/example.py | 94 +++++++++++++++++++++ build/testdata/example.rb | 77 ++++++++++++++++++ contentprovider.go | 167 ++++++++++++++++++-------------------- ctags/symbol_kind.go | 110 +++++++++++++++++++++++++ 5 files changed, 424 insertions(+), 88 deletions(-) create mode 100644 build/testdata/example.py create mode 100644 build/testdata/example.rb create mode 100644 ctags/symbol_kind.go diff --git a/build/e2e_test.go b/build/e2e_test.go index 94dbf6b9..6a07ae7d 100644 --- a/build/e2e_test.go +++ b/build/e2e_test.go @@ -815,6 +815,16 @@ func TestScoring(t *testing.T) { t.Fatal(err) } + examplePython, err := os.ReadFile("./testdata/example.py") + if err != nil { + t.Fatal(err) + } + + exampleRuby, err := os.ReadFile("./testdata/example.rb") + if err != nil { + t.Fatal(err) + } + exampleScala, err := os.ReadFile("./testdata/example.scala") if err != nil { t.Fatal(err) @@ -1088,6 +1098,60 @@ func Get() { wantScore: 8110, }, // + // Python + // + { + fileName: "example.py", + content: examplePython, + query: &query.Substring{Content: true, Pattern: "C1"}, + wantLanguage: "Python", + // 7000 (symbol) + 1000 (Python class) + 500 (word) + 10 (file order) + wantScore: 8510, + }, + { + fileName: "example.py", + content: examplePython, + query: &query.Substring{Content: true, Pattern: "g"}, + wantLanguage: "Python", + // 7000 (symbol) + 800 (Python function) + 500 (word) + 10 (file order) + wantScore: 8310, + }, + { + fileName: "example.py", + content: examplePython, + query: &query.Substring{Content: true, Pattern: "__init__"}, + wantLanguage: "Python", + // 7000 (symbol) + 400 (Python member) + 50 (partial word) + 10 (file order) + wantScore: 7460, + }, + // + // Ruby + // + { + fileName: "example.rb", + content: exampleRuby, + query: &query.Substring{Content: true, Pattern: "Parental"}, + wantLanguage: "Ruby", + // 7000 (symbol) + 1000 (Ruby class) + 500 (word) + 10 (file order) + wantScore: 8510, + }, + { + fileName: "example.rb", + content: exampleRuby, + query: &query.Substring{Content: true, Pattern: "parental_func"}, + wantLanguage: "Ruby", + // 7000 (symbol) + 900 (Ruby method) + 500 (word) + 10 (file order) + wantScore: 8410, + }, + { + fileName: "example.rb", + content: exampleRuby, + query: &query.Substring{Content: true, Pattern: "MyModule"}, + wantLanguage: "Ruby", + // 7000 (symbol) + 500 (Ruby module) + 500 (word) + 10 (file order) + wantScore: 8210, + }, + // // Scala // { diff --git a/build/testdata/example.py b/build/testdata/example.py new file mode 100644 index 00000000..17bb768e --- /dev/null +++ b/build/testdata/example.py @@ -0,0 +1,94 @@ +# v py.f def +# v py.f.x def +def f(x): + + # v py.f.g def + def g(): + y = 5 + + if True: + # v py.f.x ref + y = x # < "y" py.f.y def + else: + l1 = 3 # < "l1" py.f.l1 def + + # v py.f.i def + for i in range(10): + # v py.f.i ref + l2 = i # < "l2" py.f.l2 def + + while False: + l3 = 3 # < "l3" py.f.l3 def + + try: + l4 = 3 # < "l4" py.f.l4 def + # v py.f.e def + except Exception as e: + l5 = 3 # < "l5" py.f.l5 def + # v py.f.e ref + _ = e + + # vvvv py.f.file def + with open("file.txt") as file: + # vvvv py.f.file fef + print(file) + + # vvv py.f.lam def + # vvv py.f.lam ref + _ = lambda lam: lam + + # v py.f.y ref + # vv py.f.l1 ref + # vv py.f.l2 ref + # vv py.f.l3 ref + # vv py.f.l4 ref + # vv py.f.l5 ref + # v py.f.g ref + _ = y + l1 + l2 + l3 + l4 + l5 + g() + + # vvv recursive.foo ref,nodef + recursive = recursive.foo + + +# vv py.C1 def +class C1: + x = 5 # < "x" py.C1.x def + + def __init__(self, y): + # v py.C1.y def + self.y = y + + def f(self): + # v py.C1.x ref + # v py.C1.g ref + self.x = self.g() + + # v py.C1.g def + def g(self): + # v py.C1.y ref + return self.y + + +class C2(C1): + y = C1() + + def f(self, c1: C1): + c = c1 + # v py.C1.g ref + # v py.C1.x ref + return self.g() + c.x + + +def newC1() -> C1: + return C1() + + +# v py.C1.x ref +_ = newC1().x + +# v py.C1.x ref +# v py.C1.x ref +_ = C1().x + C2().y.x + +if False: + f(3) # < "f" py.f ref diff --git a/build/testdata/example.rb b/build/testdata/example.rb new file mode 100644 index 00000000..8a192a36 --- /dev/null +++ b/build/testdata/example.rb @@ -0,0 +1,77 @@ +SOME_CONSTANT = 2.718 + +if true +a = 1 +elsif false +b = 2 +else +c = 3 +end + +(1..5).each do |counter| +z = 3 +end + +for counter in 1..5 +y = 10 +end + +counter = 1 +while counter <= 5 do +no = true +counter += 1 +end + +begin +raise NoMemoryError, 'Z.' +rescue NoMemoryError => exception_variable +puts 'A', exception_variable +rescue RuntimeError => other_exception_variable +puts 'K' +else +puts 'L' +ensure +puts 'O' +end + +grade = 42 +case grade +when 0.100 +shouldntgetcaptured = true +puts 'you got a grade i guess' +end + +module MyModule +def self.abc(base) +end + +class MyClass +def yay +end + +def self.woo(base) +end +end +end + +class Foo +attr_accessor :bar +attr_reader :baz +attr_writer :qux +end + +class Aliased +def bar +end + +alias_method :baz, :bar +end + +class Parental +def parental_func() +end +end + +class Composed +include Parental +end diff --git a/contentprovider.go b/contentprovider.go index f4db1f71..a4487058 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -21,6 +21,8 @@ import ( "sort" "strings" "unicode/utf8" + + "github.com/sourcegraph/zoekt/ctags" ) var _ = log.Println @@ -556,7 +558,8 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, si = p.id.symbols.data(start + uint32(secIdx)) } if si != nil { - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind)) + symbolKind := ctags.ParseSymbolKind(si.Kind) + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, symbolKind)) } } @@ -635,7 +638,8 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu } if si != nil { // the LineFragment may not be on a symbol, then si will be nil. - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind)) + symbolKind := ctags.ParseSymbolKind(si.Kind) + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, symbolKind)) } } @@ -654,102 +658,89 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu // scoreKind boosts a match based on the combination of language and kind. The // language string comes from go-enry, the kind string from ctags. -func scoreKind(language string, kind string) float64 { +func scoreKind(language string, kind ctags.SymbolKind) float64 { var factor float64 // Generic ranking which will be overriden by language specific ranking switch kind { - case "type": // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 + case ctags.Type: // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 factor = 8 - case "class": + case ctags.Class: factor = 10 - case "struct": + case ctags.Struct: factor = 9.5 - case "enum": + case ctags.Enum: factor = 9 - case "interface": + case ctags.Interface: factor = 8 - case "function", "func", "method": + case ctags.Function, ctags.Method: factor = 7 - case "member", "field": + case ctags.Field: factor = 5.5 - case "constant", "const": + case ctags.Constant: factor = 5 - case "var", "variable": + case ctags.Variable: factor = 4 - default: - // No idea what it is, but its something regarded as a symbol + // For all other kinds, assign a low score by default. factor = 1 } - // Refer to universal-ctags --list-kinds-full= to learn about which - // kinds are detected for which language. - // - // Note that go-ctags uses universal-ctags's interactive mode and thus returns - // the full name for "kind" and not the one-letter abbreviation. switch language { case "Java", "java": switch kind { // 2022-03-30: go-ctags contains a regex rule for Java classes that sets "kind" // to "classes" instead of "c". We have to cover both cases to support existing // indexes. - case "class", "classes": + case ctags.Class: factor = 10 - case "enum": + case ctags.Enum: factor = 9 - case "interface": + case ctags.Interface: factor = 8 - case "method": + case ctags.Method: factor = 7 - case "field": + case ctags.Field: factor = 6 - case "enumConstant": + case ctags.EnumConstant: factor = 5 } case "Kotlin", "kotlin": switch kind { - case "class": + case ctags.Class: factor = 10 - case "interface": + case ctags.Interface: factor = 9 - case "method": + case ctags.Method: factor = 8 - case "typealias": + case ctags.TypeAlias: factor = 7 - case "constant": + case ctags.Constant: factor = 6 - case "variable": + case ctags.Variable: factor = 5 } case "Go", "go": switch kind { // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 // for each case a description of the fields in ctags in the comment - case "type": // interface struct talias + case ctags.Type: // interface struct talias factor = 10 - case "method", "function": // methodSpec - factor = 8 - case "variable": // var member - factor = 7 - case "constant": // const - factor = 6 - - case "interface": // interfaces + case ctags.Interface: // interfaces factor = 10 - case "struct": // structs + case ctags.Struct: // structs factor = 9 - case "talias": // type aliases + case ctags.TypeAlias: // type aliases factor = 9 - case "methodSpec": // interface method specification + case ctags.MethodSpec: // interface method specification factor = 8.5 - case "func": // functions + case ctags.Method, ctags.Function: // functions factor = 8 - case "member": // struct members + case ctags.Field: // struct fields factor = 7 - case "const": // constants + case ctags.Constant: // constants factor = 6 - case "var": // variables + case ctags.Variable: // variables factor = 5 } // Could also rank on: @@ -762,21 +753,21 @@ func scoreKind(language string, kind string) float64 { // - unknown unknown case "C++", "c++": switch kind { - case "class": // classes + case ctags.Class: // classes factor = 10 - case "enum": // enumeration names + case ctags.Enum: // enumeration names factor = 9 - case "function": // function definitions + case ctags.Function: // function definitions factor = 8 - case "struct": // structure names + case ctags.Struct: // structure names factor = 7 - case "union": // union names + case ctags.Union: // union names factor = 6 - case "typdef": // typedefs + case ctags.TypeAlias: // typedefs factor = 5 - case "member": // class, struct, and union members + case ctags.Field: // class, struct, and union members factor = 4 - case "variable": // varialbe definitions + case ctags.Variable: // varialbe definitions factor = 3 } // Could also rank on: @@ -788,32 +779,32 @@ func scoreKind(language string, kind string) float64 { // variable variable definitions case "Scala", "scala": switch kind { - case "class": + case ctags.Class: factor = 10 - case "interface": + case ctags.Interface: factor = 9 - case "object": + case ctags.Object: factor = 8 - case "method": + case ctags.Function: factor = 7 - case "type": + case ctags.Type: factor = 6 - case "variable": + case ctags.Variable: factor = 5 - case "package": + case ctags.Package: factor = 4 } case "Python", "python": switch kind { - case "class": // classes + case ctags.Class: // classes factor = 10 - case "function": // function definitions + case ctags.Function: // function definitions factor = 8 - case "member": // class, struct, and union members + case ctags.Field: // class, struct, and union members factor = 4 - case "variable": // variable definitions + case ctags.Variable: // variable definitions factor = 3 - case "local": // local variables + case ctags.Local: // local variables factor = 2 } // Could also rank on: @@ -824,57 +815,57 @@ func scoreKind(language string, kind string) float64 { // - parameter function parameters case "Ruby", "ruby": switch kind { - case "class": + case ctags.Class: factor = 10 - case "method": + case ctags.Method: factor = 9 - case "alias": + case ctags.MethodAlias: factor = 8 - case "module": + case ctags.Module: factor = 7 - case "singletonMethod": + case ctags.SingletonMethod: factor = 6 - case "constant": + case ctags.Constant: factor = 5 - case "accessor": + case ctags.Accessor: factor = 4 - case "library": + case ctags.Library: factor = 3 } case "PHP", "php": switch kind { - case "class": + case ctags.Class: factor = 10 - case "interface": + case ctags.Interface: factor = 9 - case "function": + case ctags.Function: factor = 8 - case "trait": + case ctags.Trait: factor = 7 - case "define": + case ctags.Define: factor = 6 - case "namespace": + case ctags.Namespace: factor = 5 - case "alias": + case ctags.MethodAlias: factor = 4 - case "variable": + case ctags.Variable: factor = 3 - case "local": + case ctags.Local: factor = 3 } case "GraphQL", "graphql": switch kind { - case "type": + case ctags.Type: factor = 10 } case "Markdown", "markdown": // Headers are good signal in docs, but do not rank as highly as code. switch kind { - case "chapter": // # + case ctags.Chapter: // # factor = 4 - case "section": // ## + case ctags.Section: // ## factor = 3 - case "subsection": // ### + case ctags.Subsection: // ### factor = 2 } } diff --git a/ctags/symbol_kind.go b/ctags/symbol_kind.go new file mode 100644 index 00000000..bc020eb7 --- /dev/null +++ b/ctags/symbol_kind.go @@ -0,0 +1,110 @@ +package ctags + +import "strings" + +type SymbolKind uint8 + +const ( + Accessor SymbolKind = iota + Chapter + Class + Constant + Define + Enum + EnumConstant + Field + Function + Interface + Library + Local + Method + MethodAlias + MethodSpec + Module + Namespace + Object + Other + Package + Section + SingletonMethod + Struct + Subsection + Trait + Type + TypeAlias + Union + Variable +) + +// ParseSymbolKind maps the output from different ctags implementations into a +// single set of constants. This is important because universal-ctags and SCIP +// ctags can return different names for the same kind. +// +// To get a sense for which kinds are detected for which language, you can +// refer to universal-ctags --list-kinds-full=. +// +// Note that go-ctags uses universal-ctags's interactive mode and thus returns +// the full name for "kind" and not the one-letter abbreviation. +func ParseSymbolKind(kind string) SymbolKind { + kind = strings.ToLower(kind) + // Generic ranking which will be overriden by language specific ranking + switch kind { + case "accessor", "setter", "getter": // SCIP ctags distinguishes these, but universal-ctags does not + return Accessor + case "chapter": + return Chapter + case "class", "classes": + return Class + case "constant", "const": + return Constant + case "define": + return Define + case "enum": + return Enum + case "enumconstant", "enummember": + return EnumConstant + case "field", "member": + return Field + case "function", "func": + return Function + case "interface": + return Interface + case "local": + return Local + case "method": + return Method + case "methodAlias", "alias": + return MethodAlias + case "methodSpec": + return MethodSpec + case "module": + return Module + case "namespace": + return Namespace + case "object": + return Object + case "package": + return Package + case "section": + return Section + case "singletonmethod": + return SingletonMethod + case "struct": + return Struct + case "subsection": + return Subsection + case "trait": + return Trait + case "type": + return Type + case "typealias", "talias", "typdef": + return TypeAlias + case "union": + return Union + case "var", "variable": + return Variable + default: + return Other + } +} + From 70521884859273d74e07d0ff09c16402f6948ede Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 26 Oct 2023 08:08:37 -0700 Subject: [PATCH 2/2] Capture 'enumerator' as EnumMember, which is common in universal-ctags --- ctags/symbol_kind.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctags/symbol_kind.go b/ctags/symbol_kind.go index bc020eb7..bfe835e7 100644 --- a/ctags/symbol_kind.go +++ b/ctags/symbol_kind.go @@ -61,7 +61,7 @@ func ParseSymbolKind(kind string) SymbolKind { return Define case "enum": return Enum - case "enumconstant", "enummember": + case "enumerator", "enumconstant", "enummember": return EnumConstant case "field", "member": return Field