Ranking: include filename matches in bm25

sourcegraph · Apr 16, 2024 · 242305e · 242305e
1 parent 59ab949
commit 242305e
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 73 deletions.
diff --git a/contentprovider.go b/contentprovider.go
@@ -137,82 +137,106 @@ func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 	return byteOff
 }
 
+// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
+// It only ever returns content XOR filename matches, not both. If there are any
+// content matches, these are always returned, and we omit filename matches.
+//
+// Performance invariant: ms is sorted and non-overlapping.
+//
+// Note: the byte slices may be backed by mmapped data, so before being
+// returned by the API it needs to be copied.
 func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
-	var result []LineMatch
-	if ms[0].fileName {
-		score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
+	var filenameMatches []*candidateMatch
+	contentMatches := ms[:0]
 
-		// There is only "line" in a filename.
-		res := LineMatch{
-			Line:     p.id.fileName(p.idx),
-			FileName: true,
-
-			Score:      score,
-			DebugScore: debugScore,
+	for _, m := range ms {
+		if m.fileName {
+			filenameMatches = append(filenameMatches, m)
+		} else {
+			contentMatches = append(contentMatches, m)
 		}
+	}
 
-		for _, m := range ms {
-			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
-				LineOffset:  int(m.byteOffset),
-				MatchLength: int(m.byteMatchSz),
-				Offset:      m.byteOffset,
-			})
+	// If there are any content matches, we only return these and skip filename matches.
+	if len(contentMatches) > 0 {
+		contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
+		return p.fillContentMatches(contentMatches, numContextLines, language, debug)
+	}
 
-			result = []LineMatch{res}
-		}
-	} else {
-		ms = breakMatchesOnNewlines(ms, p.data(false))
-		result = p.fillContentMatches(ms, numContextLines, language, debug)
+	// Otherwise, we return a single line containing the filename match.
+	score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
+	res := LineMatch{
+		Line:       p.id.fileName(p.idx),
+		FileName:   true,
+		Score:      score,
+		DebugScore: debugScore,
 	}
 
-	return result
+	for _, m := range ms {
+		res.LineFragments = append(res.LineFragments, LineFragmentMatch{
+			LineOffset:  int(m.byteOffset),
+			MatchLength: int(m.byteMatchSz),
+			Offset:      m.byteOffset,
+		})
+	}
+
+	return []LineMatch{res}
+
 }
 
-// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
+// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
+// It only ever returns content XOR filename matches, not both. If there are any content
+// matches, these are always returned, and we omit filename matches.
 //
 // Performance invariant: ms is sorted and non-overlapping.
 //
 // Note: the byte slices may be backed by mmapped data, so before being
 // returned by the API it needs to be copied.
 func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
-	var result []ChunkMatch
-	if ms[0].fileName {
-		// If the first match is a filename match, there will only be
-		// one match and the matched content will be the filename.
-
-		score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
+	var filenameMatches []*candidateMatch
+	contentMatches := ms[:0]
 
-		fileName := p.id.fileName(p.idx)
-		ranges := make([]Range, 0, len(ms))
-		for _, m := range ms {
-			ranges = append(ranges, Range{
-				Start: Location{
-					ByteOffset: m.byteOffset,
-					LineNumber: 1,
-					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
-				},
-				End: Location{
-					ByteOffset: m.byteOffset + m.byteMatchSz,
-					LineNumber: 1,
-					Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
-				},
-			})
+	for _, m := range ms {
+		if m.fileName {
+			filenameMatches = append(filenameMatches, m)
+		} else {
+			contentMatches = append(contentMatches, m)
 		}
+	}
 
-		result = []ChunkMatch{{
-			Content:      fileName,
-			ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
-			Ranges:       ranges,
-			FileName:     true,
+	// If there are any content matches, we only return these and skip filename matches.
+	if len(contentMatches) > 0 {
+		return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug)
+	}
 
-			Score:      score,
-			DebugScore: debugScore,
-		}}
-	} else {
-		result = p.fillContentChunkMatches(ms, numContextLines, language, debug)
+	// Otherwise, we return a single chunk representing the filename match.
+	score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
+	fileName := p.id.fileName(p.idx)
+	ranges := make([]Range, 0, len(ms))
+	for _, m := range ms {
+		ranges = append(ranges, Range{
+			Start: Location{
+				ByteOffset: m.byteOffset,
+				LineNumber: 1,
+				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
+			},
+			End: Location{
+				ByteOffset: m.byteOffset + m.byteMatchSz,
+				LineNumber: 1,
+				Column:     uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
+			},
+		})
 	}
 
-	return result
+	return []ChunkMatch{{
+		Content:      fileName,
+		ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
+		Ranges:       ranges,
+		FileName:     true,
+
+		Score:      score,
+		DebugScore: debugScore,
+	}}
 }
 
 func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {

diff --git a/eval.go b/eval.go
@@ -407,23 +407,6 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
 		}
 	})
 
-	// If there are content matches, trim all filename matches.
-	foundContentMatch := false
-	for _, c := range cands {
-		if !c.fileName {
-			foundContentMatch = true
-			break
-		}
-	}
-
-	res := cands[:0]
-	for _, c := range cands {
-		if !foundContentMatch || !c.fileName {
-			res = append(res, c)
-		}
-	}
-	cands = res
-
 	// If we found no candidate matches at all, assume there must have been a match on filename.
 	if len(cands) == 0 {
 		nm := d.fileName(nextDoc)
@@ -439,6 +422,7 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
 		}}
 	}
 
+	res := cands[:0]
 	if merge {
 		// Merge adjacent candidates. This guarantees that the matches
 		// are non-overlapping.
@@ -450,7 +434,15 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
 				res = append(res, c)
 				continue
 			}
+
 			last := res[len(res)-1]
+
+			// Never merge filename and content matches
+			if last.fileName != c.fileName {
+				res = append(res, c)
+				continue
+			}
+
 			lastEnd := last.byteOffset + last.byteMatchSz
 			end := c.byteOffset + c.byteMatchSz
 			if lastEnd >= c.byteOffset {
@@ -485,7 +477,15 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
 				res = append(res, c)
 				continue
 			}
+
 			last := res[len(res)-1]
+
+			// Never merge filename and content matches
+			if last.fileName != c.fileName {
+				res = append(res, c)
+				continue
+			}
+
 			lastEnd := last.byteOffset + last.byteMatchSz
 			if lastEnd > c.byteOffset {
 				continue
@@ -502,6 +502,11 @@ type sortByOffsetSlice []*candidateMatch
 func (m sortByOffsetSlice) Len() int      { return len(m) }
 func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
 func (m sortByOffsetSlice) Less(i, j int) bool {
+	// Sort all filename matches to the start
+	if m[i].fileName != m[j].fileName {
+		return m[i].fileName
+	}
+
 	if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
 		// Prefer longer candidates if starting at same position
 		return m[i].byteMatchSz > m[j].byteMatchSz

diff --git a/score.go b/score.go
@@ -119,6 +119,9 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
 // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
 // except inverse document frequency (idf), since we don't have access to global term frequency statistics.
 //
+// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
+// 'boost' matches on document titles.
+//
 // This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
 // since BM25 is not normalized and can be tricky to combine with other scoring signals.
 func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
@@ -127,7 +130,12 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [
 	termFreqs := map[string]int{}
 	for _, cand := range cands {
 		term := string(cand.substrLowered)
-		termFreqs[term]++
+
+		if cand.fileName {
+			termFreqs[term] += 2
+		} else {
+			termFreqs[term]++
+		}
 	}
 
 	// Compute the file length ratio. Usually the calculation would be based on terms, but using