Skip to content

Commit

Permalink
Ranking: include filename matches in bm25
Browse files Browse the repository at this point in the history
  • Loading branch information
jtibshirani committed Apr 16, 2024
1 parent 59ab949 commit 242305e
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 73 deletions.
134 changes: 79 additions & 55 deletions contentprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,82 +137,106 @@ func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
return byteOff
}

// fillMatches converts the internal candidateMatch slice into our API's LineMatch.
// It only ever returns content XOR filename matches, not both. If there are any
// content matches, these are always returned, and we omit filename matches.
//
// Performance invariant: ms is sorted and non-overlapping.
//
// Note: the byte slices may be backed by mmapped data, so before being
// returned by the API it needs to be copied.
func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
var result []LineMatch
if ms[0].fileName {
score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
var filenameMatches []*candidateMatch
contentMatches := ms[:0]

// There is only one "line" in a filename.
res := LineMatch{
Line: p.id.fileName(p.idx),
FileName: true,

Score: score,
DebugScore: debugScore,
for _, m := range ms {
if m.fileName {
filenameMatches = append(filenameMatches, m)
} else {
contentMatches = append(contentMatches, m)
}
}

for _, m := range ms {
res.LineFragments = append(res.LineFragments, LineFragmentMatch{
LineOffset: int(m.byteOffset),
MatchLength: int(m.byteMatchSz),
Offset: m.byteOffset,
})
// If there are any content matches, we only return these and skip filename matches.
if len(contentMatches) > 0 {
contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false))
return p.fillContentMatches(contentMatches, numContextLines, language, debug)
}

result = []LineMatch{res}
}
} else {
ms = breakMatchesOnNewlines(ms, p.data(false))
result = p.fillContentMatches(ms, numContextLines, language, debug)
// Otherwise, we return a single line containing the filename match.
score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
res := LineMatch{
Line: p.id.fileName(p.idx),
FileName: true,
Score: score,
DebugScore: debugScore,
}

return result
for _, m := range ms {
res.LineFragments = append(res.LineFragments, LineFragmentMatch{
LineOffset: int(m.byteOffset),
MatchLength: int(m.byteMatchSz),
Offset: m.byteOffset,
})
}

return []LineMatch{res}

}

// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch.
// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch.
// It only ever returns content XOR filename matches, not both. If there are any content
// matches, these are always returned, and we omit filename matches.
//
// Performance invariant: ms is sorted and non-overlapping.
//
// Note: the byte slices may be backed by mmapped data, so before being
// returned by the API it needs to be copied.
func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
var result []ChunkMatch
if ms[0].fileName {
// If the first match is a filename match, there will only be
// one match and the matched content will be the filename.

score, debugScore, _ := p.candidateMatchScore(ms, language, debug)
var filenameMatches []*candidateMatch
contentMatches := ms[:0]

fileName := p.id.fileName(p.idx)
ranges := make([]Range, 0, len(ms))
for _, m := range ms {
ranges = append(ranges, Range{
Start: Location{
ByteOffset: m.byteOffset,
LineNumber: 1,
Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
},
End: Location{
ByteOffset: m.byteOffset + m.byteMatchSz,
LineNumber: 1,
Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
},
})
for _, m := range ms {
if m.fileName {
filenameMatches = append(filenameMatches, m)
} else {
contentMatches = append(contentMatches, m)
}
}

result = []ChunkMatch{{
Content: fileName,
ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
Ranges: ranges,
FileName: true,
// If there are any content matches, we only return these and skip filename matches.
if len(contentMatches) > 0 {
return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug)
}

Score: score,
DebugScore: debugScore,
}}
} else {
result = p.fillContentChunkMatches(ms, numContextLines, language, debug)
// Otherwise, we return a single chunk representing the filename match.
score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug)
fileName := p.id.fileName(p.idx)
ranges := make([]Range, 0, len(ms))
for _, m := range ms {
ranges = append(ranges, Range{
Start: Location{
ByteOffset: m.byteOffset,
LineNumber: 1,
Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1),
},
End: Location{
ByteOffset: m.byteOffset + m.byteMatchSz,
LineNumber: 1,
Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1),
},
})
}

return result
return []ChunkMatch{{
Content: fileName,
ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
Ranges: ranges,
FileName: true,

Score: score,
DebugScore: debugScore,
}}
}

func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
Expand Down
39 changes: 22 additions & 17 deletions eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,23 +407,6 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
}
})

// If there are content matches, trim all filename matches.
foundContentMatch := false
for _, c := range cands {
if !c.fileName {
foundContentMatch = true
break
}
}

res := cands[:0]
for _, c := range cands {
if !foundContentMatch || !c.fileName {
res = append(res, c)
}
}
cands = res

// If we found no candidate matches at all, assume there must have been a match on filename.
if len(cands) == 0 {
nm := d.fileName(nextDoc)
Expand All @@ -439,6 +422,7 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
}}
}

res := cands[:0]
if merge {
// Merge adjacent candidates. This guarantees that the matches
// are non-overlapping.
Expand All @@ -450,7 +434,15 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
res = append(res, c)
continue
}

last := res[len(res)-1]

// Never merge filename and content matches
if last.fileName != c.fileName {
res = append(res, c)
continue
}

lastEnd := last.byteOffset + last.byteMatchSz
end := c.byteOffset + c.byteMatchSz
if lastEnd >= c.byteOffset {
Expand Down Expand Up @@ -485,7 +477,15 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT
res = append(res, c)
continue
}

last := res[len(res)-1]

// Never merge filename and content matches
if last.fileName != c.fileName {
res = append(res, c)
continue
}

lastEnd := last.byteOffset + last.byteMatchSz
if lastEnd > c.byteOffset {
continue
Expand All @@ -502,6 +502,11 @@ type sortByOffsetSlice []*candidateMatch
func (m sortByOffsetSlice) Len() int { return len(m) }
func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m sortByOffsetSlice) Less(i, j int) bool {
// Sort all filename matches to the start
if m[i].fileName != m[j].fileName {
return m[i].fileName
}

if m[i].byteOffset == m[j].byteOffset { // tie break if same offset
// Prefer longer candidates if starting at same position
return m[i].byteMatchSz > m[j].byteMatchSz
Expand Down
10 changes: 9 additions & 1 deletion score.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn
// algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula
// except inverse document frequency (idf), since we don't have access to global term frequency statistics.
//
// Filename matches count twice as much as content matches. This mimics a common text search strategy where you
// 'boost' matches on document titles.
//
// This scoring strategy ignores all other signals including document ranks. This keeps things simple for now,
// since BM25 is not normalized and can be tricky to combine with other scoring signals.
func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) {
Expand All @@ -127,7 +130,12 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [
termFreqs := map[string]int{}
for _, cand := range cands {
term := string(cand.substrLowered)
termFreqs[term]++

if cand.fileName {
termFreqs[term] += 2
} else {
termFreqs[term]++
}
}

// Compute the file length ratio. Usually the calculation would be based on terms, but using
Expand Down

0 comments on commit 242305e

Please sign in to comment.