Skip to content

Commit

Permalink
dampen repetition-boost with log2
Browse files Browse the repository at this point in the history
I sometimes notice very poor quality documents getting boosted on common
terms because they contain lots of matches. This factor feels like it
should work more as a tie-breaker than as something that overrides all other factors.

Test Plan: searching for "class user" only had code in the top results.
  • Loading branch information
keegancsmith committed Oct 18, 2023
1 parent f17ff0b commit 8820c2c
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"log"
"math"
"math/bits"
"regexp/syntax"
"sort"
"strings"
Expand Down Expand Up @@ -415,7 +416,7 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
}

maxFileScore := 0.0
repetitions := 0
repetitions := uint(0)
for i := range fileMatch.LineMatches {
if maxFileScore < fileMatch.LineMatches[i].Score {
maxFileScore = fileMatch.LineMatches[i].Score
Expand All @@ -442,8 +443,10 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
// the matches.
fileMatch.addScore("fragment", maxFileScore, opts.DebugScore)

// Prefer docs with several top-scored matches.
fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(repetitions), opts.DebugScore)
// Prefer docs with several top-scored matches. We use an approximation of
// log_2 (bits.Len, which is floor(log_2(n))+1 for n > 0 and 0 for n == 0) to
// prevent the repetitions from overriding other factors. In this way it acts
// more like a tie-breaker.
fileMatch.addScore("repetition-boost", scoreRepetitionFactor*float64(bits.Len(repetitions)), opts.DebugScore)

if opts.UseDocumentRanks && len(d.ranks) > int(doc) {
weight := scoreFileRankFactor
Expand Down

0 comments on commit 8820c2c

Please sign in to comment.