Skip to content

Commit

Permalink
Rename UseKeywordScoring to mention BM25
Browse files Browse the repository at this point in the history
  • Loading branch information
jtibshirani committed May 7, 2024
1 parent 9f35cb1 commit 9926a33
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 28 deletions.
10 changes: 5 additions & 5 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -946,10 +946,10 @@ type SearchOptions struct {
// will be used. This option is temporary and is only exposed for testing/tuning purposes.
DocumentRanksWeight float64

// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
// Currently, this treats each match in a file as a term and computes an approximation to BM25.
// When enabled, all other scoring signals are ignored, including document ranks.
UseKeywordScoring bool
// EXPERIMENTAL. If true, use text-search style scoring instead of the default scoring formula.
// The scoring algorithm treats each match in a file as a term and computes an approximation to
// BM25. When enabled, all other scoring signals are ignored, including document ranks.
UseBM25Scoring bool

// Trace turns on opentracing for this request if true and if the Jaeger address was provided as
// a command-line flag.
Expand Down Expand Up @@ -1015,7 +1015,7 @@ func (s *SearchOptions) String() string {
addBool("Whole", s.Whole)
addBool("ChunkMatches", s.ChunkMatches)
addBool("UseDocumentRanks", s.UseDocumentRanks)
addBool("UseKeywordScoring", s.UseKeywordScoring)
addBool("UseBM25Scoring", s.UseBM25Scoring)
addBool("Trace", s.Trace)
addBool("DebugScore", s.DebugScore)

Expand Down
4 changes: 2 additions & 2 deletions api_proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,7 @@ func SearchOptionsFromProto(p *proto.SearchOptions) *SearchOptions {
DocumentRanksWeight: p.GetDocumentRanksWeight(),
Trace: p.GetTrace(),
DebugScore: p.GetDebugScore(),
UseKeywordScoring: p.GetUseKeywordScoring(),
UseBM25Scoring: p.GetUseBM25Scoring(),
}
}

Expand All @@ -725,6 +725,6 @@ func (s *SearchOptions) ToProto() *proto.SearchOptions {
DocumentRanksWeight: s.DocumentRanksWeight,
Trace: s.Trace,
DebugScore: s.DebugScore,
UseKeywordScoring: s.UseKeywordScoring,
UseBM25Scoring: s.UseBM25Scoring,
}
}
16 changes: 8 additions & 8 deletions build/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func TestBM25(t *testing.T) {
query: &query.Substring{Pattern: "example"},
content: exampleJava,
language: "Java",
// keyword-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
// bm25-score:1.69 (sum-tf: 7.00, length-ratio: 2.00)
wantScore: 1.69,
}, {
// Matches only on content
Expand All @@ -89,7 +89,7 @@ func TestBM25(t *testing.T) {
}},
content: exampleJava,
language: "Java",
// keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
// bm25-score:5.75 (sum-tf: 56.00, length-ratio: 2.00)
wantScore: 5.75,
},
{
Expand All @@ -98,15 +98,15 @@ func TestBM25(t *testing.T) {
query: &query.Substring{Pattern: "java"},
content: exampleJava,
language: "Java",
// keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
// bm25-score:1.07 (sum-tf: 2.00, length-ratio: 2.00)
wantScore: 1.07,
},
{
// Matches only on filename, and content is missing
fileName: "a/b/c/config.go",
query: &query.Substring{Pattern: "config.go"},
language: "Go",
// keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
// bm25-score:1.91 (sum-tf: 2.00, length-ratio: 0.00)
wantScore: 1.91,
},
}
Expand Down Expand Up @@ -584,7 +584,7 @@ func skipIfCTagsUnavailable(t *testing.T, parserType ctags.CTagsParserType) {
}
}

func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) {
func checkScoring(t *testing.T, c scoreCase, useBM25 bool, parserType ctags.CTagsParserType) {
skipIfCTagsUnavailable(t, parserType)

name := c.language
Expand Down Expand Up @@ -625,9 +625,9 @@ func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType cta
defer ss.Close()

srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{
UseKeywordScoring: keywordScoring,
ChunkMatches: true,
DebugScore: true})
UseBM25Scoring: useBM25,
ChunkMatches: true,
DebugScore: true})
if err != nil {
t.Fatal(err)
}
Expand Down
2 changes: 1 addition & 1 deletion eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ nextFileMatch:
fileMatch.LineMatches = cp.fillMatches(finalCands, opts.NumContextLines, fileMatch.Language, opts.DebugScore)
}

if opts.UseKeywordScoring {
if opts.UseBM25Scoring {
d.scoreFileUsingBM25(&fileMatch, nextDoc, finalCands, opts)
} else {
// Use the standard, non-experimental scoring method by default
Expand Down
8 changes: 4 additions & 4 deletions grpc/protos/zoekt/webserver/v1/webserver.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions grpc/protos/zoekt/webserver/v1/webserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ message SearchOptions {
// If set, the search results will contain debug information for scoring.
bool debug_score = 14;

// EXPERIMENTAL. If true, use keyword-style scoring instead of the default scoring formula.
// EXPERIMENTAL. If true, use text search scoring instead of the default scoring formula.
// Currently, this treats each match in a file as a term and computes an approximation to BM25.
// When enabled, all other scoring signals are ignored, including document ranks.
bool use_keyword_scoring = 15;
bool use_bm25_scoring = 15;
}

message ListRequest {
Expand Down
8 changes: 4 additions & 4 deletions score.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ func (m *FileMatch) addScore(what string, computed float64, raw float64, debugSc
m.Score += computed
}

func (m *FileMatch) addKeywordScore(score float64, sumTf float64, L float64, debugScore bool) {
func (m *FileMatch) addBM25Score(score float64, sumTf float64, L float64, debugScore bool) {
if debugScore {
m.Debug += fmt.Sprintf("keyword-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
m.Debug += fmt.Sprintf("bm25-score:%.2f (sum-tf: %.2f, length-ratio: %.2f)", score, sumTf, L)
}
m.Score += score
}
Expand Down Expand Up @@ -116,7 +116,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
}

// scoreFileUsingBM25 computes a score for the file match using an approximation to BM25, the most common scoring
// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
// algorithm for text search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
//
// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
Expand Down Expand Up @@ -160,5 +160,5 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [
score += ((k + 1.0) * tf) / (k*(1.0-b+b*L) + tf)
}

fileMatch.addKeywordScore(score, sumTf, L, opts.DebugScore)
fileMatch.addBM25Score(score, sumTf, L, opts.DebugScore)
}
4 changes: 2 additions & 2 deletions shards/shards_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1087,7 +1087,7 @@ func TestAtomCountScore(t *testing.T) {
}
}

func TestUseKeywordScoring(t *testing.T) {
func TestUseBM25Scoring(t *testing.T) {
b := testIndexBuilder(t,
&zoekt.Repository{},
zoekt.Document{Name: "f1", Content: []byte("one two two three")},
Expand All @@ -1103,7 +1103,7 @@ func TestUseKeywordScoring(t *testing.T) {
&query.Substring{Pattern: "three"})

opts := zoekt.SearchOptions{
UseKeywordScoring: true,
UseBM25Scoring: true,
}

results, err := ss.Search(context.Background(), q, &opts)
Expand Down

0 comments on commit 9926a33

Please sign in to comment.