From 9c75cfafe3a7ea53f484899b9453938720c510de Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 13 Jun 2024 14:12:20 +0200 Subject: [PATCH] BM25: Boost file name matches at root With this change we prioritize file name matches at the root of the repository. This is based on the intuition that more important files tend to be closer to the root. We also change the parameter b in the BM25 scoring function from 0.75 to 0.3 to reduce the impact of the document length on the final score. This is based on experiments that showed that our current scoring overly penalizes long but important documents. For example, we consider documents such as a README.md or CHANGELOG at the root of the repository of high quality. However, these documents also tend to be relatively long and are thus penalized. Test plan: Updated unit test --- build/scoring_test.go | 4 ++-- eval.go | 2 +- score.go | 32 ++++++++++++++++++++++++++++---- score_test.go | 2 +- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/build/scoring_test.go b/build/scoring_test.go index 8dc2f2f9..9bdc086c 100644 --- a/build/scoring_test.go +++ b/build/scoring_test.go @@ -106,8 +106,8 @@ func TestBM25(t *testing.T) { fileName: "a/b/c/config.go", query: &query.Substring{Pattern: "config.go"}, language: "Go", - // bm25-score: 0.60 <- sum-termFrequencyScore: 5.00, length-ratio: 0.00 - wantScore: 0.60, + // bm25-score: 0.45 <- sum-termFrequencyScore: 2.00, length-ratio: 0.00 + wantScore: 0.45, }, } diff --git a/eval.go b/eval.go index af637f01..269d4299 100644 --- a/eval.go +++ b/eval.go @@ -330,7 +330,7 @@ nextFileMatch: // document frequencies. Since we don't store document frequencies in the index, // we have to defer the calculation of the final BM25 score to after the whole // shard has been processed. - tf = calculateTermFrequency(finalCands, df) + tf = calculateTermFrequency(&fileMatch, finalCands, df) } else { // Use the standard, non-experimental scoring method by default d.scoreFile(&fileMatch, nextDoc, mt, known, opts) diff --git a/score.go b/score.go index a2579df2..6004538f 100644 --- a/score.go +++ b/score.go @@ -108,18 +108,32 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn } } +func isAtRoot(path string) bool { + return strings.IndexRune(path, '/') == -1 +} + // calculateTermFrequency computes the term frequency for the file match. // // Filename matches count more than content matches. This mimics a common text // search strategy where you 'boost' matches on document titles. -func calculateTermFrequency(cands []*candidateMatch, df termDocumentFrequency) map[string]int { +func calculateTermFrequency(fm *FileMatch, cands []*candidateMatch, df termDocumentFrequency) map[string]int { // Treat each candidate match as a term and compute the frequencies. For now, ignore case // sensitivity and treat filenames and symbols the same as content. termFreqs := map[string]int{} + + boostFileName := 2 + + var evaluated bool for _, cand := range cands { term := string(cand.substrLowered) if cand.fileName { - termFreqs[term] += 5 + if !evaluated { + if isAtRoot(fm.FileName) { + boostFileName = 5 + } + evaluated = true + } + termFreqs[term] += boostFileName } else { termFreqs[term]++ } @@ -155,8 +169,18 @@ type termFrequency struct { // This keeps things simple for now, since BM25 is not normalized and can be // tricky to combine with other scoring signals. func (d *indexData) scoreFilesUsingBM25(fileMatches []FileMatch, tfs []termFrequency, df termDocumentFrequency, opts *SearchOptions) { - // Use standard parameter defaults (used in Lucene and academic papers) - k, b := 1.2, 0.75 + // k determines how quickly the TF score saturates with increasing term + // frequencies and b ∈ [0,1] determines how much the score is down-weighted for + // longer documents. + // + // The standard parameter values, used in Lucene and academic papers, are k=1.2 + // and b=0.75. However, there is some evidence that other values might work + // better depending on the characteristics of the corpus. + // + // In our experiments we found that smaller values of b work well for our use + // case. This means we don't penalize long files as much. b=0.3 is at the lower + // end of the spectrum of values that are reported in the literature. + k, b := 1.2, 0.3 averageFileLength := float64(d.boundaries[d.numDocs()]) / float64(d.numDocs()) // This is very unlikely, but explicitly guard against division by zero. diff --git a/score_test.go b/score_test.go index 2e3b1384..c2fabddf 100644 --- a/score_test.go +++ b/score_test.go @@ -37,7 +37,7 @@ func TestCalculateTermFrequency(t *testing.T) { t.Run("", func(t *testing.T) { fm := FileMatch{} df := make(termDocumentFrequency) - tf := calculateTermFrequency(c.cands, df) + tf := calculateTermFrequency(&fm, c.cands, df) if !maps.Equal(df, c.wantDF) { t.Errorf("got %v, want %v", df, c.wantDF)