From 340c5f85f0b7b6d76ac684bdcd49b800a8e9ee6d Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 29 Jan 2024 13:33:24 +0200 Subject: [PATCH] score: introduce query.Boost to scale score (#728) This commit introduces a new primitive Boost to our query language. It allows boosting (or dampening) the contribution to the score a query atoms will match contribute. To achieve this we introduce boostMatchTree which records this weight. We then adjust the visitMatches to take an initial score weight (1.0), and then each time we recurse through a boostMatchTree the score weight is multiplied by the boost weight. Additionally candidateMatch now has a new field, scoreWeight, which records the weight at time of candidate collection. Without boosting in the query this value will always be 1. Finally when scoring a candidateMatch we take the final score for it and multiply it by scoreWeight. Note: we do not expose a way to set this in the query language, only the query API. Test Plan: Manual testing against webserver via the new phrase-boost URL param. Additionally updated ranking tests to use the phrase booster. --- api_test.go | 2 +- bits.go | 5 ++ contentprovider.go | 8 +++ eval.go | 37 ++++++++--- internal/e2e/e2e_rank_test.go | 6 +- ...ets_are_not_configured_for_this_binary.txt | 4 +- internal/e2e/testdata/generate_unit_test.txt | 50 +++++++-------- internal/e2e/testdata/rank_stats.txt | 6 +- .../sourcegraphserver_docker_image_build.txt | 13 ++-- matchiter.go | 2 + matchtree.go | 56 +++++++++++++++-- query/boost.go | 61 +++++++++++++++++++ query/query.go | 25 ++++++++ web/server.go | 7 +++ 14 files changed, 230 insertions(+), 52 deletions(-) create mode 100644 query/boost.go diff --git a/api_test.go b/api_test.go index ab13f145d..ca6b47d56 100644 --- a/api_test.go +++ b/api_test.go @@ -152,7 +152,7 @@ func TestMatchSize(t *testing.T) { size: 112, }, { v: candidateMatch{}, - size: 72, + size: 80, }, { v: candidateChunk{}, size: 40, diff --git a/bits.go b/bits.go index 649088c8a..8f6397049 100644 --- a/bits.go +++ b/bits.go @@ -16,6 +16,7 @@ package zoekt import ( "encoding/binary" + "math" "sort" "unicode" "unicode/utf8" @@ -391,3 +392,7 @@ func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) { func (m runeOffsetMap) sizeBytes() int { return 8 * len(m) } + +func epsilonEqualsOne(scoreWeight float64) bool { + return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9 +} diff --git a/contentprovider.go b/contentprovider.go index 15156cab0..69e6cb0ef 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -660,6 +660,14 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str } } + // scoreWeight != 1 means it affects score + if !epsilonEqualsOne(m.scoreWeight) { + score.score = score.score * m.scoreWeight + if debug { + score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) + } + } + if score.score > maxScore.score { maxScore.score = score.score maxScore.what = score.what diff --git a/eval.go b/eval.go index 7808f733b..0a136aa24 100644 --- a/eval.go +++ b/eval.go @@ -420,7 +420,7 @@ nextFileMatch: // whether there's an exact match on a symbol, the number of query clauses that matched, etc. func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { atomMatchCount := 0 - visitMatches(mt, known, func(mt matchTree) { + visitMatchAtoms(mt, known, func(mt matchTree) { atomMatchCount++ }) @@ -544,6 +544,15 @@ func (m sortByOffsetSlice) Less(i, j int) bool { return m[i].byteOffset < m[j].byteOffset } +// setScoreWeight is a helper used by gatherMatches to set the weight based on +// the score weight of the matchTree. +func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { + for _, m := range cm { + m.scoreWeight = scoreWeight + } + return cm +} + // Gather matches from this document. This never returns a mixture of // filename/content matches: if there are content matches, all // filename matches are trimmed from the result. The matches are @@ -554,18 +563,18 @@ func (m sortByOffsetSlice) Less(i, j int) bool { // but adjacent matches will remain. func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { var cands []*candidateMatch - visitMatches(mt, known, func(mt matchTree) { + visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { if smt, ok := mt.(*substrMatchTree); ok { - cands = append(cands, smt.current...) + cands = append(cands, setScoreWeight(scoreWeight, smt.current)...) } if rmt, ok := mt.(*regexpMatchTree); ok { - cands = append(cands, rmt.found...) + cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) } if rmt, ok := mt.(*wordMatchTree); ok { - cands = append(cands, rmt.found...) + cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) } if smt, ok := mt.(*symbolRegexpMatchTree); ok { - cands = append(cands, smt.found...) + cands = append(cands, setScoreWeight(scoreWeight, smt.found)...) } }) @@ -590,6 +599,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid // are non-overlapping. sort.Sort((sortByOffsetSlice)(cands)) res = cands[:0] + mergeRun := 1 for i, c := range cands { if i == 0 { res = append(res, c) @@ -599,10 +609,23 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid lastEnd := last.byteOffset + last.byteMatchSz end := c.byteOffset + c.byteMatchSz if lastEnd >= c.byteOffset { + mergeRun++ + + // Average out the score across the merged candidates. Only do it if + // we are boosting to avoid floating point funkiness in the normal + // case. + if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) { + last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun) + } + + // latest candidate goes further, update our end if end > lastEnd { last.byteMatchSz = end - last.byteOffset } + continue + } else { + mergeRun = 1 } res = append(res, c) @@ -649,7 +672,7 @@ func (d *indexData) branchIndex(docID uint32) int { // returns all branches containing docID. func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { var mask uint64 - visitMatches(mt, known, func(mt matchTree) { + visitMatchAtoms(mt, known, func(mt matchTree) { bq, ok := mt.(*branchQueryMatchTree) if !ok { return diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go index e65cc0125..47e6666d7 100644 --- a/internal/e2e/e2e_rank_test.go +++ b/internal/e2e/e2e_rank_test.go @@ -118,6 +118,10 @@ func TestRanking(t *testing.T) { t.Fatal(err) } + // q is marshalled as part of the test, so avoid our rewrites for + // ranking. + qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{}) + sOpts := zoekt.SearchOptions{ // Use the same options sourcegraph has by default ChunkMatches: true, @@ -128,7 +132,7 @@ func TestRanking(t *testing.T) { DebugScore: *debugScore, } - result, err := ss.Search(context.Background(), q, &sOpts) + result, err := ss.Search(context.Background(), qSearch, &sOpts) if err != nil { t.Fatal(err) } diff --git a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt index a501118e4..a0e84ef88 100644 --- a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt +++ b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt @@ -3,9 +3,9 @@ query: (and substr:"assets" substr:"are" substr:"not" substr:"configured" substr targetRank: 1 **github.com/sourcegraph/sourcegraph/ui/assets/assets.go** +30: return nil, errors.New("assets are not configured for this binary, please see ui/assets") +34: panic("assets are not configured for this binary, please see ui/assets") 33:func (p FailingAssetsProvider) Assets() http.FileSystem { -14: Assets() http.FileSystem -1:package assets hidden 12 more line matches github.com/sourcegraph/sourcegraph/schema/schema.go diff --git a/internal/e2e/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt index 57dad53e4..b545b3451 100644 --- a/internal/e2e/testdata/generate_unit_test.txt +++ b/internal/e2e/testdata/generate_unit_test.txt @@ -1,6 +1,30 @@ queryString: generate unit test query: (and substr:"generate" substr:"unit" substr:"test") -targetRank: 11 +targetRank: 1 + +**github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts** +16: public title = 'Generate Unit Test' +14:export class GenerateTest implements Recipe { +15: public id: RecipeID = 'generate-unit-test' +hidden 3 more line matches + +github.com/sourcegraph/sourcegraph/client/jetbrains/README.md +40:- Generate unit test +41:- Generate docstring +61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code. +hidden 7 more line matches + +github.com/sourcegraph/cody/vscode/CHANGELOG.md +298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602) +218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907) +264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907) +hidden 17 more line matches + +github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md +158:- Generate unit test +138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://.sourcegraph.com/users//settings/tokens`). From here, generate a new access token. +159:- Generate docstring +hidden 3 more line matches github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go 300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) { @@ -14,28 +38,4 @@ github.com/golang/go/src/cmd/vendor/github.com/google/pprof/internal/report/repo 75: SampleUnit string // Unit for the sample data from the profile. hidden 48 more line matches -github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua -9: generate = function(_, paths) -6: patterns = { pattern.new_path_basename "sg-test" }, -8: -- Invoked as part of unit tests for the autoindexing service -hidden 1 more line matches - -github.com/golang/go/src/cmd/internal/testdir/testdir_test.go -273:type test struct { -74:func Test(t *testing.T) { -263:type testCommon struct { -hidden 120 more line matches - -github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go -65: Unit string // seconds, nanoseconds, bytes, etc -77: NumUnit map[string][]string -68: unitX int64 -hidden 44 more line matches - -github.com/golang/go/src/cmd/link/internal/loader/loader.go -79: unit *sym.CompilationUnit -1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit { -228: generatedSyms Bitmap // symbols that generate their content, indexed by ext sym idx -hidden 50 more line matches - hidden 245 more file matches diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt index 5de9de054..81969e3f6 100644 --- a/internal/e2e/testdata/rank_stats.txt +++ b/internal/e2e/testdata/rank_stats.txt @@ -1,4 +1,4 @@ queries: 14 -recall@1: 7 (50%) -recall@5: 9 (64%) -mrr: 0.579471 +recall@1: 9 (64%) +recall@5: 11 (79%) +mrr: 0.710733 diff --git a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt index b1c431e98..cf80e597e 100644 --- a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt +++ b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt @@ -1,6 +1,11 @@ queryString: sourcegraph/server docker image build query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build") -targetRank: 14 +targetRank: 1 + +**github.com/sourcegraph/sourcegraph/dev/tools.go** +7: // zoekt-* used in sourcegraph/server docker image build +1://go:build tools +2:// +build tools github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go 458: Build int @@ -32,10 +37,4 @@ github.com/sourcegraph/sourcegraph/internal/updatecheck/handler.go 50: latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8") hidden 19 more line matches -github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md -1:# Docker Single Container Deployment -294:### Insiders build -238:### File system performance on Docker for Mac -hidden 52 more line matches - hidden 15 more file matches diff --git a/matchiter.go b/matchiter.go index 68c6e4856..98bf6b1ca 100644 --- a/matchiter.go +++ b/matchiter.go @@ -27,6 +27,8 @@ type candidateMatch struct { substrBytes []byte substrLowered []byte + scoreWeight float64 + file uint32 symbolIdx uint32 diff --git a/matchtree.go b/matchtree.go index 102e0bc26..1e43ac89f 100644 --- a/matchtree.go +++ b/matchtree.go @@ -170,6 +170,11 @@ type fileNameMatchTree struct { child matchTree } +type boostMatchTree struct { + child matchTree + boost float64 +} + // Don't visit this subtree for collecting matches. type noVisitMatchTree struct { matchTree @@ -392,6 +397,10 @@ func (t *fileNameMatchTree) prepare(doc uint32) { t.child.prepare(doc) } +func (t *boostMatchTree) prepare(doc uint32) { + t.child.prepare(doc) +} + func (t *substrMatchTree) prepare(nextDoc uint32) { t.matchIterator.prepare(nextDoc) t.current = t.matchIterator.candidates() @@ -455,6 +464,10 @@ func (t *fileNameMatchTree) nextDoc() uint32 { return t.child.nextDoc() } +func (t *boostMatchTree) nextDoc() uint32 { + return t.child.nextDoc() +} + func (t *branchQueryMatchTree) nextDoc() uint32 { var start uint32 if t.firstDone { @@ -515,6 +528,10 @@ func (t *fileNameMatchTree) String() string { return fmt.Sprintf("f(%v)", t.child) } +func (t *boostMatchTree) String() string { + return fmt.Sprintf("boost(%f, %v)", t.boost, t.child) +} + func (t *substrMatchTree) String() string { f := "" if t.fileName { @@ -556,6 +573,8 @@ func visitMatchTree(t matchTree, f func(matchTree)) { visitMatchTree(s.child, f) case *fileNameMatchTree: visitMatchTree(s.child, f) + case *boostMatchTree: + visitMatchTree(s.child, f) case *symbolSubstrMatchTree: visitMatchTree(s.substrMatchTree, f) case *symbolRegexpMatchTree: @@ -575,33 +594,41 @@ func updateMatchTreeStats(mt matchTree, stats *Stats) { }) } +func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { + visitMatches(t, known, 1, func(mt matchTree, _ float64) { + f(mt) + }) +} + // visitMatches visits all atoms which can contribute matches. Note: This // skips noVisitMatchTree. -func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { +func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { switch s := t.(type) { case *andMatchTree: for _, ch := range s.children { if known[ch] { - visitMatches(ch, known, f) + visitMatches(ch, known, weight, f) } } case *andLineMatchTree: - visitMatches(&s.andMatchTree, known, f) + visitMatches(&s.andMatchTree, known, weight, f) case *orMatchTree: for _, ch := range s.children { if known[ch] { - visitMatches(ch, known, f) + visitMatches(ch, known, weight, f) } } + case *boostMatchTree: + visitMatches(s.child, known, weight*s.boost, f) case *symbolSubstrMatchTree: - visitMatches(s.substrMatchTree, known, f) + visitMatches(s.substrMatchTree, known, weight, f) case *notMatchTree: case *noVisitMatchTree: // don't collect into negative trees. case *fileNameMatchTree: // We will just gather the filename if we do not visit this tree. default: - f(s) + f(s, weight) } } @@ -876,6 +903,10 @@ func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[mat return evalMatchTree(cp, cost, known, t.child) } +func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + return evalMatchTree(cp, cost, known, t.child) +} + func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { if t.contEvaluated { return matchesStateForSlice(t.current) @@ -997,6 +1028,17 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) child: ct, }, nil + case *query.Boost: + ct, err := d.newMatchTree(s.Child, opt) + if err != nil { + return nil, err + } + + return &boostMatchTree{ + child: ct, + boost: s.Boost, + }, nil + case *query.Substring: return d.newSubstringMatchTree(s) @@ -1288,6 +1330,8 @@ func pruneMatchTree(mt matchTree) (matchTree, error) { } case *fileNameMatchTree: mt.child, err = pruneMatchTree(mt.child) + case *boostMatchTree: + mt.child, err = pruneMatchTree(mt.child) case *andLineMatchTree: child, err := pruneMatchTree(&mt.andMatchTree) if err != nil { diff --git a/query/boost.go b/query/boost.go new file mode 100644 index 000000000..2c2291ba8 --- /dev/null +++ b/query/boost.go @@ -0,0 +1,61 @@ +package query + +type ExperimentalPhraseBoostOptions struct { + // The phrase needs to contain atleast this many terms. This is based on the + // parsed query. + // + // Defaults to 3. + MinTerms int + + // Boost is how much to multiply the phrase match scores by. + // + // Defaults to 20. + Boost float64 +} + +// ExperimentalPhraseBoost transforms q into a query containing exact matches +// to phrase boosted. opts control how and when the boosting is done. +// +// Note: This is a temporary API and will be removed in future commits. +func ExpirementalPhraseBoost(q Q, phrase string, opts ExperimentalPhraseBoostOptions) Q { + if opts.MinTerms == 0 { + opts.MinTerms = 3 + } + if opts.Boost == 0 { + opts.Boost = 20 + } + + contentAtoms := 0 + caseSensitive := false + VisitAtoms(q, func(q Q) { + switch s := q.(type) { + case *Regexp: + // Check atom is for content + if s.Content || (s.Content == s.FileName) { + caseSensitive = s.CaseSensitive + contentAtoms++ + } + case *Substring: + if s.Content || (s.Content == s.FileName) { + caseSensitive = s.CaseSensitive + contentAtoms++ + } + } + }) + + if contentAtoms < opts.MinTerms { + return q + } + + return NewOr( + &Boost{ + Boost: opts.Boost, + Child: &Substring{ + Pattern: phrase, + Content: true, + CaseSensitive: caseSensitive, + }, + }, + q, + ) +} diff --git a/query/query.go b/query/query.go index 6a8c5dd3a..479a0bb46 100644 --- a/query/query.go +++ b/query/query.go @@ -386,6 +386,19 @@ func (q *Type) String() string { } } +// Boost scales the contribution to score of descendents. +type Boost struct { + Child Q + // Boost will multiply the score of its descendents. Values less than 1 will + // give less importance while values greater than 1 will give more + // importance. + Boost float64 +} + +func (q *Boost) String() string { + return fmt.Sprintf("(boost %0.2f %s)", q.Boost, q.Child) +} + // Substring is the most basic query: a query for a substring. type Substring struct { Pattern string @@ -609,6 +622,9 @@ func flatten(q Q) (Q, bool) { case *Type: child, changed := flatten(s.Child) return &Type{Child: child, Type: s.Type}, changed + case *Boost: + child, changed := flatten(s.Child) + return &Boost{Child: child, Boost: s.Boost}, changed default: return q, false } @@ -680,6 +696,12 @@ func evalConstants(q Q) Q { return ch } return &Type{Child: ch, Type: s.Type} + case *Boost: + ch := evalConstants(s.Child) + if _, ok := ch.(*Const); ok { + return ch + } + return &Boost{Boost: s.Boost, Child: ch} case *Substring: if len(s.Pattern) == 0 { return &Const{true} @@ -728,6 +750,8 @@ func Map(q Q, f func(q Q) Q) Q { q = &Not{Child: Map(s.Child, f)} case *Type: q = &Type{Type: s.Type, Child: Map(s.Child, f)} + case *Boost: + q = &Boost{Boost: s.Boost, Child: Map(s.Child, f)} } return f(q) } @@ -768,6 +792,7 @@ func VisitAtoms(q Q, v func(q Q)) { case *Or: case *Not: case *Type: + case *Boost: default: v(iQ) } diff --git a/web/server.go b/web/server.go index 5fc7e0fb7..7096e5679 100644 --- a/web/server.go +++ b/web/server.go @@ -241,6 +241,13 @@ func (s *Server) serveSearchErr(r *http.Request) (*ApiSearchResult, error) { return nil, err } + // Experimental: The query string and boost exact phrases of it. + if phraseBoost, err := strconv.ParseFloat(qvals.Get("phrase-boost"), 64); err == nil { + q = query.ExpirementalPhraseBoost(q, queryStr, query.ExperimentalPhraseBoostOptions{ + Boost: phraseBoost, + }) + } + repoOnly := true query.VisitAtoms(q, func(q query.Q) { _, ok := q.(*query.Repo)