Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename UseKeywordScoring to mention BM25 #778

Merged
merged 2 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -946,10 +946,10 @@ type SearchOptions struct {
// will be used. This option is temporary and is only exposed for testing/tuning purposes.
DocumentRanksWeight float64

// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
// Currently, this treats each match in a file as a term and computes an approximation to BM25.
// When enabled, all other scoring signals are ignored, including document ranks.
UseKeywordScoring bool
// EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula.
// The scoring algorithm treats each match in a file as a term and computes an approximation to
// BM25. When enabled, all other scoring signals are ignored, including document ranks.
UseBM25Scoring bool
mmanela marked this conversation as resolved.
Show resolved Hide resolved

// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
// a command-line flag
Expand Down Expand Up @@ -1015,7 +1015,7 @@ func (s *SearchOptions) String() string {
addBool("Whole", s.Whole)
addBool("ChunkMatches", s.ChunkMatches)
addBool("UseDocumentRanks", s.UseDocumentRanks)
addBool("UseKeywordScoring", s.UseKeywordScoring)
addBool("UseBM25Scoring", s.UseBM25Scoring)
addBool("Trace", s.Trace)
addBool("DebugScore", s.DebugScore)

Expand Down
4 changes: 2 additions & 2 deletions api_proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,7 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {
DocumentRanksWeight: p.GetDocumentRanksWeight(),
Trace: p.GetTrace(),
DebugScore: p.GetDebugScore(),
UseKeywordScoring: p.GetUseKeywordScoring(),
UseBM25Scoring: p.GetUseBm25Scoring(),
}
}

Expand All @@ -725,6 +725,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {
DocumentRanksWeight: s.DocumentRanksWeight,
Trace: s.Trace,
DebugScore: s.DebugScore,
UseKeywordScoring: s.UseKeywordScoring,
UseBm25Scoring: s.UseBM25Scoring,
}
}
16 changes: 8 additions & 8 deletions build/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func TestBM25(t *testing.T) {
query: &query.Substring{Pattern: "example"},
content: exampleJava,
language: "Java",
// keyword-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
// bm25-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
wantScore: 1.69,
}, {
// Matches only on content
Expand All @@ -89,7 +89,7 @@ func TestBM25(t *testing.T) {
}},
content: exampleJava,
language: "Java",
// keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
// bm25-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
wantScore: 5.75,
},
{
Expand All @@ -98,15 +98,15 @@ func TestBM25(t *testing.T) {
query: &query.Substring{Pattern: "java"},
content: exampleJava,
language: "Java",
// keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
// bm25-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
wantScore: 1.07,
},
{
// Matches only on filename, and content is missing
fileName: "a/b/c/config.go",
query: &query.Substring{Pattern: "config.go"},
language: "Go",
// keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
// bm25-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
wantScore: 1.91,
},
}
Expand Down Expand Up @@ -584,7 +584,7 @@ func skipIfCTagsUnavailable(t *testing.T, parserType ctags.CTagsParserType) {
}
}

func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) {
func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTagsParserType) {
skipIfCTagsUnavailable(t, parserType)

name := c.language
Expand Down Expand Up @@ -625,9 +625,9 @@ func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType cta
defer ss.Close()

srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{
UseKeywordScoring: keywordScoring,
ChunkMatches: true,
DebugScore: true})
UseBM25Scoring: useBM25,
ChunkMatches: true,
DebugScore: true})
if err != nil {
t.Fatal(err)
}
Expand Down
2 changes: 1 addition & 1 deletion eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ nextFileMatch:
fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
}

if opts.UseKeywordScoring {
if opts.UseBM25Scoring {
d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
} else {
// Use the standard, non-experimental scoring method by default
Expand Down
765 changes: 382 additions & 383 deletions grpc/protos/zoekt/webserver/v1/webserver.pb.go

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions grpc/protos/zoekt/webserver/v1/webserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ message SearchOptions {
// If set, the search results will contain debug information for scoring.
bool debug_score = 14;

// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
// EXPERIMENTAL. If true, use text search scoring instead of the default scoring formula.
// Currently, this treats each match in a file as a term and computes an approximation to BM25.
// When enabled, all other scoring signals are ignored, including document ranks.
bool use_keyword_scoring = 15;
bool use_bm25_scoring = 15;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ggilmore is it safe to rename this field in the proto file? I don't know enough about gRPC :) I'm assuming yes since the type didn't change.

}

message ListRequest {
Expand Down
8 changes: 4 additions & 4 deletions score.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ func (m *FileMatch) addScore(what string, computed float64, raw float64, debugSc
m.Score += computed
}

func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
func (m *FileMatch) addBM25Score(score float64, sumTf float64, L float64, debugScore bool) {
if debugScore {
m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
m.Debug += fmt.Sprintf("bm25-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
}
m.Score += score
}
Expand Down Expand Up @@ -116,7 +116,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
}

// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
// algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
//
// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
Expand Down Expand Up @@ -160,5 +160,5 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [
score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
}

fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
fileMatch.addBM25Score(score, sumTf, L, opts.DebugScore)
}
4 changes: 2 additions & 2 deletions shards/shards_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1087,7 +1087,7 @@ func TestAtomCountScore(t *testing.T) {
}
}

func TestUseKeywordScoring(t *testing.T) {
func TestUseBM25Scoring(t *testing.T) {
b := testIndexBuilder(t,
&zoekt.Repository{},
zoekt.Document{Name: "f1", Content: []byte("one two two three")},
Expand All @@ -1103,7 +1103,7 @@ func TestUseKeywordScoring(t *testing.T) {
&query.Substring{Pattern: "three"})

opts := zoekt.SearchOptions{
UseKeywordScoring: true,
UseBM25Scoring: true,
}

results, err := ss.Search(context.Background(), q, &opts)
Expand Down
Loading