From 9926a330af01409c041e4d5a614c3fd78f4d220a Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Tue, 7 May 2024 10:05:37 -0700 Subject: [PATCH] Rename UseKeywordScoring to mention BM25 --- api.go | 10 +++++----- api_proto.go | 4 ++-- build/scoring_test.go | 16 ++++++++-------- eval.go | 2 +- grpc/protos/zoekt/webserver/v1/webserver.pb.go | 8 ++++---- grpc/protos/zoekt/webserver/v1/webserver.proto | 4 ++-- score.go | 8 ++++---- shards/shards_test.go | 4 ++-- 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/api.go b/api.go index 6d2497d1..192b6a83 100644 --- a/api.go +++ b/api.go @@ -946,10 +946,10 @@ type SearchOptions struct { // will be used. This option is temporary and is only exposed for testing/ tuning purposes. DocumentRanksWeight float64 - // EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula. - // Currently, this treats each match in a file as a term and computes an approximation to BM25. - // When enabled, all other scoring signals are ignored, including document ranks. - UseKeywordScoring bool + // EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula. + // The scoring algorithm treats each match in a file as a term and computes an approximation to + // BM25. When enabled, all other scoring signals are ignored, including document ranks. + UseBM25Scoring bool // Trace turns on opentracing for this request if true and if the Jaeger address was provided as // a command-line flag @@ -1015,7 +1015,7 @@ func (s *SearchOptions) String() string { addBool("Whole", s.Whole) addBool("ChunkMatches", s.ChunkMatches) addBool("UseDocumentRanks", s.UseDocumentRanks) - addBool("UseKeywordScoring", s.UseKeywordScoring) + addBool("UseBM25Scoring", s.UseBM25Scoring) addBool("Trace", s.Trace) addBool("DebugScore", s.DebugScore) diff --git a/api_proto.go b/api_proto.go index 7e17b8ca..4b55f87f 100644 --- a/api_proto.go +++ b/api_proto.go @@ -700,7 +700,7 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions { DocumentRanksWeight: p.GetDocumentRanksWeight(), Trace: p.GetTrace(), DebugScore: p.GetDebugScore(), - UseKeywordScoring: p.GetUseKeywordScoring(), + UseBM25Scoring: p.GetUseBM25Scoring(), } } @@ -725,6 +725,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions { DocumentRanksWeight: s.DocumentRanksWeight, Trace: s.Trace, DebugScore: s.DebugScore, - UseKeywordScoring: s.UseKeywordScoring, + UseBM25Scoring: s.UseBM25Scoring, } } diff --git a/build/scoring_test.go b/build/scoring_test.go index 9bf243fc..03a13928 100644 --- a/build/scoring_test.go +++ b/build/scoring_test.go @@ -77,7 +77,7 @@ func TestBM25(t *testing.T) { query: &query.Substring{Pattern: "example"}, content: exampleJava, language: "Java", - // keyword-score:1.69 (sum-tf: 7.00, length-ratio: 2.00) + // bm25-score:1.69 (sum-tf: 7.00, length-ratio: 2.00) wantScore: 1.69, }, { // Matches only on content @@ -89,7 +89,7 @@ func TestBM25(t *testing.T) { }}, content: exampleJava, language: "Java", - // keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00) + // bm25-score:5.75 (sum-tf: 56.00, length-ratio: 2.00) wantScore: 5.75, }, { @@ -98,7 +98,7 @@ func TestBM25(t *testing.T) { query: &query.Substring{Pattern: "java"}, content: exampleJava, language: "Java", - // keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00) + // bm25-score:1.07 (sum-tf: 2.00, length-ratio: 2.00) wantScore: 1.07, }, { @@ -106,7 +106,7 @@ func TestBM25(t *testing.T) { fileName: "a/b/c/config.go", query: &query.Substring{Pattern: "config.go"}, language: "Go", - // keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00) + // bm25-score:1.91 (sum-tf: 2.00, length-ratio: 0.00) wantScore: 1.91, }, } @@ -584,7 +584,7 @@ func skipIfCTagsUnavailable(t *testing.T, parserType ctags.CTagsParserType) { } } -func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) { +func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTagsParserType) { skipIfCTagsUnavailable(t, parserType) name := c.language @@ -625,9 +625,9 @@ func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType cta defer ss.Close() srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{ - UseKeywordScoring: keywordScoring, - ChunkMatches: true, - DebugScore: true}) + UseBM25Scoring: useBM25, + ChunkMatches: true, + DebugScore: true}) if err != nil { t.Fatal(err) } diff --git a/eval.go b/eval.go index cc0e61f2..0d8ec91b 100644 --- a/eval.go +++ b/eval.go @@ -317,7 +317,7 @@ nextFileMatch: fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore) } - if opts.UseKeywordScoring { + if opts.UseBM25Scoring { d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts) } else { // Use the standard, non-experimental scoring method by default diff --git a/grpc/protos/zoekt/webserver/v1/webserver.pb.go b/grpc/protos/zoekt/webserver/v1/webserver.pb.go index f7bfb442..db057db7 100644 --- a/grpc/protos/zoekt/webserver/v1/webserver.pb.go +++ b/grpc/protos/zoekt/webserver/v1/webserver.pb.go @@ -388,10 +388,10 @@ type SearchOptions struct { Trace bool `protobuf:"varint,13,opt,name=trace,proto3" json:"trace,omitempty"` // If set, the search results will contain debug information for scoring. DebugScore bool `protobuf:"varint,14,opt,name=debug_score,json=debugScore,proto3" json:"debug_score,omitempty"` - // EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula. + // EXPERIMENTAL. If true, use text search scoring instead of the default scoring formula. // Currently, this treats each match in a file as a term and computes an approximation to BM25. // When enabled, all other scoring signals are ignored, including document ranks. - UseKeywordScoring bool `protobuf:"varint,15,opt,name=use_keyword_scoring,json=useKeywordScoring,proto3" json:"use_keyword_scoring,omitempty"` + UseBM25Scoring bool `protobuf:"varint,15,opt,name=use_bm25_scoring,json=useBM25Scoring,proto3" json:"use_bm25_scoring,omitempty"` } func (x *SearchOptions) Reset() { @@ -531,9 +531,9 @@ func (x *SearchOptions) GetDebugScore() bool { return false } -func (x *SearchOptions) GetUseKeywordScoring() bool { +func (x *SearchOptions) GetUseBM25Scoring() bool { if x != nil { - return x.UseKeywordScoring + return x.UseBM25Scoring } return false } diff --git a/grpc/protos/zoekt/webserver/v1/webserver.proto b/grpc/protos/zoekt/webserver/v1/webserver.proto index 981a119d..7ffe7008 100644 --- a/grpc/protos/zoekt/webserver/v1/webserver.proto +++ b/grpc/protos/zoekt/webserver/v1/webserver.proto @@ -107,10 +107,10 @@ message SearchOptions { // If set, the search results will contain debug information for scoring. bool debug_score = 14; - // EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula. + // EXPERIMENTAL. If true, use text search scoring instead of the default scoring formula. // Currently, this treats each match in a file as a term and computes an approximation to BM25. // When enabled, all other scoring signals are ignored, including document ranks. - bool use_keyword_scoring = 15; + bool use_bm25_scoring = 15; } message ListRequest { diff --git a/score.go b/score.go index 115eabd2..dc0f4c19 100644 --- a/score.go +++ b/score.go @@ -39,9 +39,9 @@ func (m *FileMatch) addScore(what string, computed float64, raw float64, debugSc m.Score += computed } -func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) { +func (m *FileMatch) addBM25Score(score float64, sumTf float64, L float64, debugScore bool) { if debugScore { - m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) + m.Debug += fmt.Sprintf("bm25-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L) } m.Score += score } @@ -116,7 +116,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn } // scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring -// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula +// algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula // except inverse document frequency (idf), since we don't have access to global term frequency statistics. // // Filename matches count twice as much as content matches. This mimics a common text search strategy where you @@ -160,5 +160,5 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [ score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf) } - fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore) + fileMatch.addBM25Score(score, sumTf, L, opts.DebugScore) } diff --git a/shards/shards_test.go b/shards/shards_test.go index 5c4ddc73..4a9a3865 100644 --- a/shards/shards_test.go +++ b/shards/shards_test.go @@ -1087,7 +1087,7 @@ func TestAtomCountScore(t *testing.T) { } } -func TestUseKeywordScoring(t *testing.T) { +func TestUseBM25Scoring(t *testing.T) { b := testIndexBuilder(t, &zoekt.Repository{}, zoekt.Document{Name: "f1", Content: []byte("one two two three")}, @@ -1103,7 +1103,7 @@ func TestUseKeywordScoring(t *testing.T) { &query.Substring{Pattern: "three"}) opts := zoekt.SearchOptions{ - UseKeywordScoring: true, + UseBM25Scoring: true, } results, err := ss.Search(context.Background(), q, &opts)