score: do scoring on candidateMatches (#723)
This is a refactor that removes our duplicated scoring logic for
ChunkMatch vs LineMatch. Instead, we score the underlying
[]*candidateMatch slices.

Beyond being a good cleanup, candidateMatch is a much more appropriate
structure than our public APIs for stuffing in extra scoring
information, so this enables the work we want to do around atom-based
scoring.
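
As a rough, self-contained sketch of the shape this enables (the
candidate type, field names, and weights below are illustrative
stand-ins, not zoekt's real fields or scoring constants): both the
LineMatch and ChunkMatch paths hand their raw candidates to one scoring
routine and simply copy the result.

package main

import "fmt"

// candidate is an illustrative stand-in for the internal candidateMatch.
type candidate struct {
	byteOffset  uint32
	byteMatchSz uint32
	fileName    bool
}

// scoreCandidates sketches the idea behind candidateMatchScore: score the
// raw candidates once, independent of whether the caller is building
// LineMatches or ChunkMatches.
func scoreCandidates(ms []*candidate) (float64, string) {
	best, what := 0.0, ""
	for _, m := range ms {
		s := float64(m.byteMatchSz) // placeholder weight, not the real heuristic
		if m.fileName {
			s *= 2 // placeholder filename boost
		}
		if s > best {
			best, what = s, fmt.Sprintf("sz:%d", m.byteMatchSz)
		}
	}
	return best, what
}

func main() {
	ms := []*candidate{{byteOffset: 10, byteMatchSz: 4}, {byteOffset: 42, byteMatchSz: 7}}
	score, debug := scoreCandidates(ms)
	// Both fill paths would copy score/debug into their own result type.
	fmt.Println(score, debug)
}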

The only behaviour changes in this commit are two fixes:
- DocumentSection caching would fail for files with no sections, since
  we relied on the cached slice being non-nil. This led to inflated
  ContentBytesLoaded (sketched after this list).
- An empty FileMatch would still read in DocumentSections, even though
  they weren't needed.
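
For the first fix, here is a minimal, self-contained sketch (with
made-up names) of the nil-vs-empty-slice pitfall; the real change is
the make([]DocumentSection, 0) guard added to read.go further down.

package main

import "fmt"

// docSection mirrors the shape of DocumentSection for illustration only.
type docSection struct{ Start, End uint32 }

// provider is a hypothetical stand-in for contentProvider's section cache.
type provider struct {
	reads  int // stands in for the ContentBytesLoaded accounting
	cached []docSection
}

// docSections treats a nil slice as "not loaded yet". If the underlying
// read returns nil for a file with no sections, the cache never sticks and
// every call re-reads; caching a non-nil empty slice instead fixes that.
func (p *provider) docSections(read func() []docSection) []docSection {
	if p.cached == nil {
		p.reads++
		ds := read()
		if ds == nil {
			ds = make([]docSection, 0) // the guard that makes the cache stick
		}
		p.cached = ds
	}
	return p.cached
}

func main() {
	p := &provider{}
	noSections := func() []docSection { return nil } // a file with no doc sections
	p.docSections(noSections)
	p.docSections(noSections)
	fmt.Println(p.reads) // 1 with the guard; 2 (and growing) without it
}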

Test Plan: go test. Our coverage is decent: we have lots of ranking
tests, which did not change, and hardcoded stats of the work done,
which also did not change (except for the fixes above).
keegancsmith authored Jan 22, 2024
1 parent 18b6999 commit b82a981
Showing 3 changed files with 98 additions and 68 deletions.
154 changes: 88 additions & 66 deletions contentprovider.go
@@ -140,10 +140,15 @@ func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
var result []LineMatch
if ms[0].fileName {
score, debugScore, _ := p.candidateMatchScore(ms, language, debug)

// There is only "line" in a filename.
res := LineMatch{
Line: p.id.fileName(p.idx),
FileName: true,

Score: score,
DebugScore: debugScore,
}

for _, m := range ms {
@@ -157,12 +162,7 @@ func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int,
}
} else {
ms = breakMatchesOnNewlines(ms, p.data(false))
result = p.fillContentMatches(ms, numContextLines)
}

sects := p.docSections()
for i, m := range result {
result[i].Score, result[i].DebugScore = p.matchScore(sects, &m, language, debug)
result = p.fillContentMatches(ms, numContextLines, language, debug)
}

return result
@@ -180,6 +180,8 @@ func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines
// If the first match is a filename match, there will only be
// one match and the matched content will be the filename.

score, debugScore, _ := p.candidateMatchScore(ms, language, debug)

fileName := p.id.fileName(p.idx)
ranges := make([]Range, 0, len(ms))
for _, m := range ms {
@@ -202,20 +204,18 @@ func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines
ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1},
Ranges: ranges,
FileName: true,

Score: score,
DebugScore: debugScore,
}}
} else {
result = p.fillContentChunkMatches(ms, numContextLines)
}

sects := p.docSections()
for i, m := range result {
result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
result = p.fillContentChunkMatches(ms, numContextLines, language, debug)
}

return result
}

func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch {
func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch {
var result []LineMatch
for len(ms) > 0 {
m := ms[0]
@@ -271,19 +271,18 @@ func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLin
finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines)
}

for _, m := range lineCands {
score, debugScore, symbolInfo := p.candidateMatchScore(lineCands, language, debug)
finalMatch.Score = score
finalMatch.DebugScore = debugScore

for i, m := range lineCands {
fragment := LineFragmentMatch{
Offset: m.byteOffset,
LineOffset: int(m.byteOffset) - lineStart,
MatchLength: int(m.byteMatchSz),
}
if m.symbol {
start := p.id.fileEndSymbol[p.idx]
fragment.SymbolInfo = p.id.symbols.data(start + m.symbolIdx)
if fragment.SymbolInfo != nil {
sec := p.docSections()[m.symbolIdx]
fragment.SymbolInfo.Sym = string(data[sec.Start:sec.End])
}
if i < len(symbolInfo) && symbolInfo[i] != nil {
fragment.SymbolInfo = symbolInfo[i]
}

finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
@@ -293,7 +292,7 @@ func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLin
return result
}

func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch {
newlines := p.newlines()
data := p.data(false)

@@ -311,9 +310,10 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
chunks := chunkCandidates(ms, newlines, numContextLines)
chunkMatches := make([]ChunkMatch, 0, len(chunks))
for _, chunk := range chunks {
score, debugScore, symbolInfo := p.candidateMatchScore(chunk.candidates, language, debug)

ranges := make([]Range, 0, len(chunk.candidates))
var symbolInfo []*Symbol
for i, cm := range chunk.candidates {
for _, cm := range chunk.candidates {
startOffset := cm.byteOffset
endOffset := cm.byteOffset + cm.byteMatchSz
startLine, startLineOffset, _ := newlines.atOffset(startOffset)
@@ -331,19 +331,6 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
Column: columnHelper.get(endLineOffset, endOffset),
},
})

if cm.symbol {
if symbolInfo == nil {
symbolInfo = make([]*Symbol, len(chunk.candidates))
}
start := p.id.fileEndSymbol[p.idx]
si := p.id.symbols.data(start + cm.symbolIdx)
if si != nil {
sec := p.docSections()[cm.symbolIdx]
si.Sym = string(data[sec.Start:sec.End])
}
symbolInfo[i] = si
}
}

firstLineNumber := int(chunk.firstLine) - numContextLines
@@ -362,6 +349,8 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
FileName: false,
Ranges: ranges,
SymbolInfo: symbolInfo,
Score: score,
DebugScore: debugScore,
})
}
return chunkMatches
@@ -548,7 +537,7 @@ const (

// findSection checks whether a section defined by offset and size lies within
// one of the sections in secs.
func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) {
j := sort.Search(len(secs), func(i int) bool {
return secs[i].End >= off+sz
})
@@ -558,12 +547,43 @@ func findSection(secs []DocumentSection, off, sz uint32) (int, bool) {
}

if secs[j].Start <= off && off+sz <= secs[j].End {
return j, true
return uint32(j), true
}
return 0, false
}

func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, language string, debug bool) (float64, string) {
func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) {
if cm.fileName {
return DocumentSection{}, nil, false
}

secs := p.docSections()

secIdx, ok := cm.symbolIdx, cm.symbol
if !ok {
// Not from a symbol matchtree. Lets see if it intersects with a symbol.
secIdx, ok = findSection(secs, cm.byteOffset, cm.byteMatchSz)
}
if !ok {
return DocumentSection{}, nil, false
}

sec := secs[secIdx]

// Now lets hydrate in the SymbolInfo. We do not hydrate in SymbolInfo.Sym
// since some callsites do not need it stored, and that incurs an extra
// copy.
//
// 2024-01-08 we are refactoring this and the code path indicates this can
// fail, so callers need to handle nil symbol. However, it would be
// surprising that we have a matching section but not symbol data.
start := p.id.fileEndSymbol[p.idx]
si := p.id.symbols.data(start + secIdx)

return sec, si, true
}

func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language string, debug bool) (float64, string, []*Symbol) {
type debugScore struct {
what string
score float64
@@ -579,16 +599,15 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch,
score.score += s
}

data := p.data(m.FileName)
filename := p.data(true)
var symbolInfo []*Symbol

for i, r := range m.Ranges {
// calculate the start and end offset relative to the start of the content
relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
relEndOffset := int(r.End.ByteOffset - m.ContentStart.ByteOffset)
for i, m := range ms {
data := p.data(m.fileName)

startBoundary := relStartOffset < len(m.Content) && (relStartOffset == 0 || byteClass(m.Content[relStartOffset-1]) != byteClass(m.Content[relStartOffset]))
endBoundary := relEndOffset > 0 && (relEndOffset == len(m.Content) || byteClass(m.Content[relEndOffset-1]) != byteClass(m.Content[relEndOffset]))
endOffset := m.byteOffset + m.byteMatchSz
startBoundary := m.byteOffset < uint32(len(data)) && (m.byteOffset == 0 || byteClass(data[m.byteOffset-1]) != byteClass(data[m.byteOffset]))
endBoundary := endOffset > 0 && (endOffset == uint32(len(data)) || byteClass(data[endOffset-1]) != byteClass(data[endOffset]))

score.score = 0
score.what = ""
@@ -599,21 +618,20 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch,
addScore("PartialWordMatch", scorePartialWordMatch)
}

if m.FileName {
sep := bytes.LastIndexByte(m.Content, '/')
startMatch := relStartOffset == sep+1
endMatch := relEndOffset == len(m.Content)
if m.fileName {
sep := bytes.LastIndexByte(data, '/')
startMatch := int(m.byteOffset) == sep+1
endMatch := endOffset == uint32(len(data))
if startMatch && endMatch {
addScore("Base", scoreBase)
} else if startMatch || endMatch {
addScore("EdgeBase", (scoreBase+scorePartialBase)/2)
} else if sep < relStartOffset {
} else if sep < int(m.byteOffset) {
addScore("InnerBase", scorePartialBase)
}
} else if secIdx, ok := findSection(secs, uint32(r.Start.ByteOffset), uint32(r.End.ByteOffset-r.Start.ByteOffset)); ok {
sec := secs[secIdx]
startMatch := sec.Start == uint32(r.Start.ByteOffset)
endMatch := sec.End == uint32(r.End.ByteOffset)
} else if sec, si, ok := p.findSymbol(m); ok {
startMatch := sec.Start == m.byteOffset
endMatch := sec.End == endOffset
if startMatch && endMatch {
addScore("Symbol", scoreSymbol)
} else if startMatch || endMatch {
@@ -622,19 +640,23 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch,
addScore("InnerSymbol", scorePartialSymbol)
}

var si *Symbol
if m.SymbolInfo != nil {
si = m.SymbolInfo[i]
}
if si == nil {
// for non-symbol queries, we need to hydrate in SymbolInfo.
start := p.id.fileEndSymbol[p.idx]
si = p.id.symbols.data(start + uint32(secIdx))
}
// Score based on symbol data
if si != nil {
symbolKind := ctags.ParseSymbolKind(si.Kind)
sym := sectionSlice(data, sec)

addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))

// This is from a symbol tree, so we need to store the symbol
// information.
if m.symbol {
if symbolInfo == nil {
symbolInfo = make([]*Symbol, len(ms))
}
// findSymbols does not hydrate in Sym. So we need to store it.
si.Sym = string(sym)
symbolInfo[i] = si
}
}
}

@@ -648,7 +670,7 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch,
maxScore.what = fmt.Sprintf("score:%.2f <- %s", maxScore.score, strings.TrimSuffix(maxScore.what, ", "))
}

return maxScore.score, maxScore.what
return maxScore.score, maxScore.what, symbolInfo
}

func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, language string, debug bool) (float64, string) {
2 changes: 1 addition & 1 deletion eval_test.go
@@ -160,7 +160,7 @@ func TestSearch_ShardRepoMaxMatchCountOpt(t *testing.T) {

t.Run("stats", func(t *testing.T) {
got, want := sr.Stats, Stats{
ContentBytesLoaded: 2,
ContentBytesLoaded: 0,
FileCount: 2,
FilesConsidered: 2,
FilesSkipped: 2,
10 changes: 9 additions & 1 deletion read.go
@@ -532,7 +532,15 @@ func (d *indexData) readDocSections(i uint32, buf []DocumentSection) ([]Document
return nil, 0, err
}

return unmarshalDocSections(blob, buf), sec.sz, nil
ds := unmarshalDocSections(blob, buf)

// can be nil if buf is nil and there are no doc sections. However, we rely
// on it being non-nil to cache the read.
if ds == nil {
ds = make([]DocumentSection, 0)
}

return ds, sec.sz, nil
}

func (d *indexData) readRanks(toc *indexTOC) error {
