diff --git a/build/scoring_test.go b/build/scoring_test.go index 8a93d08b..e3941fec 100644 --- a/build/scoring_test.go +++ b/build/scoring_test.go @@ -60,7 +60,59 @@ func TestFileNameMatch(t *testing.T) { } for _, c := range cases { - checkScoring(t, c, ctags.UniversalCTags) + checkScoring(t, c, false, ctags.UniversalCTags) + } +} + +func TestBM25(t *testing.T) { + exampleJava, err := os.ReadFile("./testdata/example.java") + if err != nil { + t.Fatal(err) + } + + cases := []scoreCase{ + { + // Matches on both filename and content + fileName: "example.java", + query: &query.Substring{Pattern: "example"}, + content: exampleJava, + language: "Java", + // keyword-score:1.63 (sum-tf: 6.00, length-ratio: 2.00) + wantScore: 1.63, + }, { + // Matches only on content + fileName: "example.java", + query: &query.And{Children: []query.Q{ + &query.Substring{Pattern: "inner"}, + &query.Substring{Pattern: "static"}, + &query.Substring{Pattern: "interface"}, + }}, + content: exampleJava, + language: "Java", + // keyword-score:5.75 (sum-tf: 56.00, length-ratio: 2.00) + wantScore: 5.75, + }, + { + // Matches only on filename + fileName: "example.java", + query: &query.Substring{Pattern: "java"}, + content: exampleJava, + language: "Java", + // keyword-score:1.07 (sum-tf: 2.00, length-ratio: 2.00) + wantScore: 1.07, + }, + { + // Matches only on filename, and content is missing + fileName: "a/b/c/config.go", + query: &query.Substring{Pattern: "config.go"}, + language: "Go", + // keyword-score:1.91 (sum-tf: 2.00, length-ratio: 0.00) + wantScore: 1.91, + }, + } + + for _, c := range cases { + checkScoring(t, c, true, ctags.UniversalCTags) } } @@ -197,7 +249,7 @@ func TestJava(t *testing.T) { } for _, c := range cases { - checkScoring(t, c, ctags.UniversalCTags) + checkScoring(t, c, false, ctags.UniversalCTags) } } @@ -261,7 +313,7 @@ func TestKotlin(t *testing.T) { parserType := ctags.UniversalCTags for _, c := range cases { t.Run(c.language, func(t *testing.T) { - checkScoring(t, c, 
parserType) + checkScoring(t, c, false, parserType) }) } } @@ -318,7 +370,7 @@ func TestCpp(t *testing.T) { parserType := ctags.UniversalCTags for _, c := range cases { t.Run(c.language, func(t *testing.T) { - checkScoring(t, c, parserType) + checkScoring(t, c, false, parserType) }) } } @@ -350,7 +402,7 @@ func TestPython(t *testing.T) { for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { for _, c := range cases { - checkScoring(t, c, parserType) + checkScoring(t, c, false, parserType) } } @@ -364,7 +416,7 @@ func TestPython(t *testing.T) { wantScore: 7860, } - checkScoring(t, scipOnlyCase, ctags.ScipCTags) + checkScoring(t, scipOnlyCase, false, ctags.ScipCTags) } func TestRuby(t *testing.T) { @@ -402,7 +454,7 @@ func TestRuby(t *testing.T) { for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { for _, c := range cases { - checkScoring(t, c, parserType) + checkScoring(t, c, false, parserType) } } } @@ -450,7 +502,7 @@ func TestScala(t *testing.T) { parserType := ctags.UniversalCTags for _, c := range cases { - checkScoring(t, c, parserType) + checkScoring(t, c, false, parserType) } } @@ -509,7 +561,7 @@ func Get() { for _, parserType := range []ctags.CTagsParserType{ctags.UniversalCTags, ctags.ScipCTags} { for _, c := range cases { - checkScoring(t, c, parserType) + checkScoring(t, c, false, parserType) } } } @@ -532,7 +584,7 @@ func skipIfCTagsUnavailable(t *testing.T, parserType ctags.CTagsParserType) { } } -func checkScoring(t *testing.T, c scoreCase, parserType ctags.CTagsParserType) { +func checkScoring(t *testing.T, c scoreCase, keywordScoring bool, parserType ctags.CTagsParserType) { skipIfCTagsUnavailable(t, parserType) name := c.language @@ -572,7 +624,10 @@ func checkScoring(t *testing.T, c scoreCase, parserType ctags.CTagsParserType) { } defer ss.Close() - srs, err := ss.Search(context.Background(), c.query, &zoekt.SearchOptions{DebugScore: true}) + srs, err := 
ss.Search(context.Background(), c.query, &zoekt.SearchOptions{ + UseKeywordScoring: keywordScoring, + ChunkMatches: true, + DebugScore: true}) if err != nil { t.Fatal(err) } @@ -582,7 +637,7 @@ func checkScoring(t *testing.T, c scoreCase, parserType ctags.CTagsParserType) { } if got := srs.Files[0].Score; math.Abs(got-c.wantScore) > epsilon { - t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].LineMatches[0].DebugScore) + t.Fatalf("score: want %f, got %f\ndebug: %s\ndebugscore: %s", c.wantScore, got, srs.Files[0].Debug, srs.Files[0].ChunkMatches[0].DebugScore) } if got := srs.Files[0].Language; got != c.language { diff --git a/contentprovider.go b/contentprovider.go index de0c8d03..c3c9ef6e 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -137,82 +137,106 @@ func (p *contentProvider) findOffset(filename bool, r uint32) uint32 { return byteOff } +// fillMatches converts the internal candidateMatch slice into our API's LineMatch. +// It only ever returns content XOR filename matches, not both. If there are any +// content matches, these are always returned, and we omit filename matches. +// +// Performance invariant: ms is sorted and non-overlapping. +// +// Note: the byte slices may be backed by mmapped data, so before being +// returned by the API it needs to be copied. func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { - var result []LineMatch - if ms[0].fileName { - score, debugScore, _ := p.candidateMatchScore(ms, language, debug) + var filenameMatches []*candidateMatch + contentMatches := ms[:0] - // There is only "line" in a filename. 
- res := LineMatch{ - Line: p.id.fileName(p.idx), - FileName: true, - - Score: score, - DebugScore: debugScore, + for _, m := range ms { + if m.fileName { + filenameMatches = append(filenameMatches, m) + } else { + contentMatches = append(contentMatches, m) } + } - for _, m := range ms { - res.LineFragments = append(res.LineFragments, LineFragmentMatch{ - LineOffset: int(m.byteOffset), - MatchLength: int(m.byteMatchSz), - Offset: m.byteOffset, - }) + // If there are any content matches, we only return these and skip filename matches. + if len(contentMatches) > 0 { + contentMatches = breakMatchesOnNewlines(contentMatches, p.data(false)) + return p.fillContentMatches(contentMatches, numContextLines, language, debug) + } - result = []LineMatch{res} - } - } else { - ms = breakMatchesOnNewlines(ms, p.data(false)) - result = p.fillContentMatches(ms, numContextLines, language, debug) + // Otherwise, we return a single line containing the filename match. + score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) + res := LineMatch{ + Line: p.id.fileName(p.idx), + FileName: true, + Score: score, + DebugScore: debugScore, } - return result + for _, m := range ms { + res.LineFragments = append(res.LineFragments, LineFragmentMatch{ + LineOffset: int(m.byteOffset), + MatchLength: int(m.byteMatchSz), + Offset: m.byteOffset, + }) + } + + return []LineMatch{res} + } -// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. +// fillChunkMatches converts the internal candidateMatch slice into our API's ChunkMatch. +// It only ever returns content XOR filename matches, not both. If there are any content +// matches, these are always returned, and we omit filename matches. // // Performance invariant: ms is sorted and non-overlapping. // // Note: the byte slices may be backed by mmapped data, so before being // returned by the API it needs to be copied. 
func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { - var result []ChunkMatch - if ms[0].fileName { - // If the first match is a filename match, there will only be - // one match and the matched content will be the filename. - - score, debugScore, _ := p.candidateMatchScore(ms, language, debug) + var filenameMatches []*candidateMatch + contentMatches := ms[:0] - fileName := p.id.fileName(p.idx) - ranges := make([]Range, 0, len(ms)) - for _, m := range ms { - ranges = append(ranges, Range{ - Start: Location{ - ByteOffset: m.byteOffset, - LineNumber: 1, - Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), - }, - End: Location{ - ByteOffset: m.byteOffset + m.byteMatchSz, - LineNumber: 1, - Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), - }, - }) + for _, m := range ms { + if m.fileName { + filenameMatches = append(filenameMatches, m) + } else { + contentMatches = append(contentMatches, m) } + } - result = []ChunkMatch{{ - Content: fileName, - ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, - Ranges: ranges, - FileName: true, + // If there are any content matches, we only return these and skip filename matches. + if len(contentMatches) > 0 { + return p.fillContentChunkMatches(contentMatches, numContextLines, language, debug) + } - Score: score, - DebugScore: debugScore, - }} - } else { - result = p.fillContentChunkMatches(ms, numContextLines, language, debug) + // Otherwise, we return a single chunk representing the filename match. 
+ score, debugScore, _ := p.candidateMatchScore(filenameMatches, language, debug) + fileName := p.id.fileName(p.idx) + ranges := make([]Range, 0, len(ms)) + for _, m := range ms { + ranges = append(ranges, Range{ + Start: Location{ + ByteOffset: m.byteOffset, + LineNumber: 1, + Column: uint32(utf8.RuneCount(fileName[:m.byteOffset]) + 1), + }, + End: Location{ + ByteOffset: m.byteOffset + m.byteMatchSz, + LineNumber: 1, + Column: uint32(utf8.RuneCount(fileName[:m.byteOffset+m.byteMatchSz]) + 1), + }, + }) } - return result + return []ChunkMatch{{ + Content: fileName, + ContentStart: Location{ByteOffset: 0, LineNumber: 1, Column: 1}, + Ranges: ranges, + FileName: true, + + Score: score, + DebugScore: debugScore, + }} } func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []LineMatch { diff --git a/eval.go b/eval.go index 83342bb1..7cbd3658 100644 --- a/eval.go +++ b/eval.go @@ -382,10 +382,9 @@ func addRepo(res *SearchResult, repo *Repository) { res.LineFragments[repo.Name] = repo.LineFragmentTemplate } -// Gather matches from this document. This never returns a mixture of -// filename/content matches: if there are content matches, all -// filename matches are trimmed from the result. The matches are -// returned in document order and are non-overlapping. +// Gather matches from this document. The matches are returned in document +// order and are non-overlapping. All filename and content matches are +// returned, with filename matches first. // // If `merge` is set, overlapping and adjacent matches will be merged // into a single match. Otherwise, overlapping matches will be removed, @@ -407,23 +406,6 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT } }) - // If there are content matches, trim all filename matches. 
- foundContentMatch := false - for _, c := range cands { - if !c.fileName { - foundContentMatch = true - break - } - } - - res := cands[:0] - for _, c := range cands { - if !foundContentMatch || !c.fileName { - res = append(res, c) - } - } - cands = res - // If we found no candidate matches at all, assume there must have been a match on filename. if len(cands) == 0 { nm := d.fileName(nextDoc) @@ -439,23 +421,30 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT }} } - if merge { - // Merge adjacent candidates. This guarantees that the matches - // are non-overlapping. - sort.Sort((sortByOffsetSlice)(cands)) - res = cands[:0] - mergeRun := 1 - for i, c := range cands { - if i == 0 { - res = append(res, c) - continue - } - last := res[len(res)-1] + sort.Sort((sortByOffsetSlice)(cands)) + res := cands[:0] + mergeRun := 1 + for i, c := range cands { + if i == 0 { + res = append(res, c) + continue + } + + last := res[len(res)-1] + + // Never compare filename and content matches + if last.fileName != c.fileName { + res = append(res, c) + continue + } + + if merge { + // Merge adjacent candidates. This guarantees that the matches + // are non-overlapping. lastEnd := last.byteOffset + last.byteMatchSz end := c.byteOffset + c.byteMatchSz if lastEnd >= c.byteOffset { mergeRun++ - // Average out the score across the merged candidates. Only do it if // we are boosting to avoid floating point funkiness in the normal // case. @@ -472,27 +461,16 @@ func (d *indexData) gatherMatches(nextDoc uint32, mt matchTree, known map[matchT } else { mergeRun = 1 } - - res = append(res, c) - } - } else { - // Remove overlapping candidates. This guarantees that the matches - // are non-overlapping, but also preserves expected match counts. - sort.Sort((sortByOffsetSlice)(cands)) - res = cands[:0] - for i, c := range cands { - if i == 0 { - res = append(res, c) - continue - } - last := res[len(res)-1] + } else { + // Remove overlapping candidates. 
This guarantees that the matches + // are non-overlapping, but also preserves expected match counts. lastEnd := last.byteOffset + last.byteMatchSz if lastEnd > c.byteOffset { continue } - - res = append(res, c) } + + res = append(res, c) } return res } @@ -502,6 +480,11 @@ type sortByOffsetSlice []*candidateMatch func (m sortByOffsetSlice) Len() int { return len(m) } func (m sortByOffsetSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] } func (m sortByOffsetSlice) Less(i, j int) bool { + // Sort all filename matches to the start + if m[i].fileName != m[j].fileName { + return m[i].fileName + } + if m[i].byteOffset == m[j].byteOffset { // tie break if same offset // Prefer longer candidates if starting at same position return m[i].byteMatchSz > m[j].byteMatchSz diff --git a/score.go b/score.go index d5c721a8..115eabd2 100644 --- a/score.go +++ b/score.go @@ -119,6 +119,9 @@ func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, kn // algorithm for keyword search: https://en.wikipedia.org/wiki/Okapi_BM25. It implements all parts of the formula // except inverse document frequency (idf), since we don't have access to global term frequency statistics. // +// Filename matches count twice as much as content matches. This mimics a common text search strategy where you +// 'boost' matches on document titles. +// // This scoring strategy ignores all other signals including document ranks. This keeps things simple for now, // since BM25 is not normalized and can be tricky to combine with other scoring signals. func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands []*candidateMatch, opts *SearchOptions) { @@ -127,7 +130,12 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [ termFreqs := map[string]int{} for _, cand := range cands { term := string(cand.substrLowered) - termFreqs[term]++ + + if cand.fileName { + termFreqs[term] += 2 + } else { + termFreqs[term]++ + } } // Compute the file length ratio. 
Usually the calculation would be based on terms, but using @@ -135,6 +143,11 @@ func (d *indexData) scoreFileUsingBM25(fileMatch *FileMatch, doc uint32, cands [ fileLength := float64(d.boundaries[doc+1] - d.boundaries[doc]) numFiles := len(d.boundaries) averageFileLength := float64(d.boundaries[numFiles-1]) / float64(numFiles) + + // This is very unlikely, but explicitly guard against division by zero. + if averageFileLength == 0 { + averageFileLength++ + } L := fileLength / averageFileLength // Use standard parameter defaults (used in Lucene and academic papers)