score: introduce query.Boost to scale score

This commit introduces a new primitive, Boost, to our query language. It allows boosting (or dampening) the contribution that the matches of a query atom make to the score. To achieve this we introduce boostMatchTree, which records this weight. We then adjust visitMatches to take an initial score weight (1.0), and each time we recurse through a boostMatchTree the score weight is multiplied by the boost weight. Additionally, candidateMatch now has a new field, scoreWeight, which records the weight at the time of candidate collection. Without boosting in the query this value will always be 1. Finally, when scoring a candidateMatch we take the final score for it and multiply it by scoreWeight. Note: we do not expose a way to set this in the query language, only in the query API. Test Plan: Manual testing against webserver via the new phrase-boost URL param. Additionally, updated the ranking tests to use the phrase booster.
sourcegraph · Jan 29, 2024 · 11d7f81 · 11d7f81
1 parent cdb1665
commit 11d7f81
Show file tree

Hide file tree

Showing 13 changed files with 220 additions and 52 deletions.
diff --git a/api_test.go b/api_test.go
@@ -152,7 +152,7 @@ func TestMatchSize(t *testing.T) {
 		size: 112,
 	}, {
 		v:    candidateMatch{},
-		size: 72,
+		size: 80,
 	}, {
 		v:    candidateChunk{},
 		size: 40,

diff --git a/contentprovider.go b/contentprovider.go
@@ -660,6 +660,13 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str
 			}
 		}
 
+		if m.scoreWeight != 1 { // should we be using epsilon comparison here?
+			score.score = score.score * m.scoreWeight
+			if debug {
+				score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
+			}
+		}
+
 		if score.score > maxScore.score {
 			maxScore.score = score.score
 			maxScore.what = score.what

diff --git a/eval.go b/eval.go
@@ -420,7 +420,7 @@ nextFileMatch:
 // whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
 	atomMatchCount := 0
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatchAtoms(mt, known, func(mt matchTree) {
 		atomMatchCount++
 	})
 
@@ -544,6 +544,13 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
 	return m[i].byteOffset < m[j].byteOffset
 }
 
+func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
+	for _, m := range cm {
+		m.scoreWeight = scoreWeight
+	}
+	return cm
+}
+
 // Gather matches from this document. This never returns a mixture of
 // filename/content matches: if there are content matches, all
 // filename matches are trimmed from the result. The matches are
@@ -554,18 +561,20 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
 // but adjacent matches will remain.
 func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
 	var cands []*candidateMatch
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
+		// TODO apply scoreWeight to candidates
+		_ = scoreWeight
 		if smt, ok := mt.(*substrMatchTree); ok {
-			cands = append(cands, smt.current...)
+			cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
 		}
 		if rmt, ok := mt.(*regexpMatchTree); ok {
-			cands = append(cands, rmt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
 		}
 		if rmt, ok := mt.(*wordMatchTree); ok {
-			cands = append(cands, rmt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
 		}
 		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
-			cands = append(cands, smt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
 		}
 	})
 
@@ -590,6 +599,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
 		// are non-overlapping.
 		sort.Sort((sortByOffsetSlice)(cands))
 		res = cands[:0]
+		mergeRun := 1
 		for i, c := range cands {
 			if i == 0 {
 				res = append(res, c)
@@ -599,10 +609,23 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
 			lastEnd := last.byteOffset + last.byteMatchSz
 			end := c.byteOffset + c.byteMatchSz
 			if lastEnd >= c.byteOffset {
+				mergeRun++
+
+				// Average out the score across the merged candidates. Only do it if
+				// we are boosting to avoid floating point funkiness in the normal
+				// case.
+				if last.scoreWeight != 1 && c.scoreWeight != 1 {
+					last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
+				}
+
+				// latest candidate goes further, update our end
 				if end > lastEnd {
 					last.byteMatchSz = end - last.byteOffset
 				}
+
 				continue
+			} else {
+				mergeRun = 1
 			}
 
 			res = append(res, c)
@@ -649,7 +672,7 @@ func (d *indexData) branchIndex(docID uint32) int {
 // returns all branches containing docID.
 func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
 	var mask uint64
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatchAtoms(mt, known, func(mt matchTree) {
 		bq, ok := mt.(*branchQueryMatchTree)
 		if !ok {
 			return

diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go
@@ -118,6 +118,10 @@ func TestRanking(t *testing.T) {
 				t.Fatal(err)
 			}
 
+			// q is marshalled as part of the test, so avoid our rewrites for
+			// ranking.
+			qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{})
+
 			sOpts := zoekt.SearchOptions{
 				// Use the same options sourcegraph has by default
 				ChunkMatches:       true,
@@ -128,7 +132,7 @@ func TestRanking(t *testing.T) {
 
 				DebugScore: *debugScore,
 			}
-			result, err := ss.Search(context.Background(), q, &sOpts)
+			result, err := ss.Search(context.Background(), qSearch, &sOpts)
 			if err != nil {
 				t.Fatal(err)
 			}

diff --git a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt
@@ -3,9 +3,9 @@ query: (and substr:"assets" substr:"are" substr:"not" substr:"configured" substr
 targetRank: 1
 
 **github.com/sourcegraph/sourcegraph/ui/assets/assets.go**
+30:	return nil, errors.New("assets are not configured for this binary, please see ui/assets")
+34:	panic("assets are not configured for this binary, please see ui/assets")
 33:func (p FailingAssetsProvider) Assets() http.FileSystem {
-14:	Assets() http.FileSystem
-1:package assets
 hidden 12 more line matches
 
 github.com/sourcegraph/sourcegraph/schema/schema.go

diff --git a/internal/e2e/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt
@@ -1,6 +1,30 @@
 queryString: generate unit test
 query: (and substr:"generate" substr:"unit" substr:"test")
-targetRank: 11
+targetRank: 1
+
+**github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts**
+16:    public title = 'Generate Unit Test'
+14:export class GenerateTest implements Recipe {
+15:    public id: RecipeID = 'generate-unit-test'
+hidden 3 more line matches
+
+github.com/sourcegraph/sourcegraph/client/jetbrains/README.md
+40:- Generate unit test
+41:- Generate docstring
+61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code.
+hidden 7 more line matches
+
+github.com/sourcegraph/cody/vscode/CHANGELOG.md
+298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602)
+218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
+264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
+hidden 17 more line matches
+
+github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md
+158:- Generate unit test
+138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://<your-instance>.sourcegraph.com/users/<your-instance>/settings/tokens`). From here, generate a new access token.
+159:- Generate docstring
+hidden 3 more line matches
 
 github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go
 300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) {
@@ -14,28 +38,4 @@ github.com/golang/go/src/cmd/vendor/github.com/google/pprof/internal/report/repo
 75:	SampleUnit        string // Unit for the sample data from the profile.
 hidden 48 more line matches
 
-github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua
-9:  generate = function(_, paths)
-6:  patterns = { pattern.new_path_basename "sg-test" },
-8:  -- Invoked as part of unit tests for the autoindexing service
-hidden 1 more line matches
-
-github.com/golang/go/src/cmd/internal/testdir/testdir_test.go
-273:type test struct {
-74:func Test(t *testing.T) {
-263:type testCommon struct {
-hidden 120 more line matches
-
-github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go
-65:	Unit string // seconds, nanoseconds, bytes, etc
-77:	NumUnit  map[string][]string
-68:	unitX int64
-hidden 44 more line matches
-
-github.com/golang/go/src/cmd/link/internal/loader/loader.go
-79:	unit         *sym.CompilationUnit
-1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit {
-228:	generatedSyms        Bitmap // symbols that generate their content, indexed by ext sym idx
-hidden 50 more line matches
-
 hidden 245 more file matches
diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt
@@ -1,4 +1,4 @@
 queries: 14
-recall@1: 7 (50%)
-recall@5: 9 (64%)
-mrr: 0.579471
+recall@1: 9 (64%)
+recall@5: 11 (79%)
+mrr: 0.710733
diff --git a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt
@@ -1,6 +1,11 @@
 queryString: sourcegraph/server docker image build
 query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build")
-targetRank: 14
+targetRank: 1
+
+**github.com/sourcegraph/sourcegraph/dev/tools.go**
+7:	// zoekt-* used in sourcegraph/server docker image build
+1://go:build tools
+2:// +build tools
 
 github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go
 458:	Build       int
@@ -32,10 +37,4 @@ github.com/sourcegraph/sourcegraph/internal/updatecheck/handler.go
 50:	latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8")
 hidden 19 more line matches
 
-github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md
-1:# Docker Single Container Deployment
-294:### Insiders build
-238:### File system performance on Docker for Mac
-hidden 52 more line matches
-
 hidden 15 more file matches
diff --git a/matchiter.go b/matchiter.go
@@ -27,6 +27,8 @@ type candidateMatch struct {
 	substrBytes   []byte
 	substrLowered []byte
 
+	scoreWeight float64
+
 	file      uint32
 	symbolIdx uint32
 

diff --git a/matchtree.go b/matchtree.go
@@ -170,6 +170,11 @@ type fileNameMatchTree struct {
 	child matchTree
 }
 
+type boostMatchTree struct {
+	child  matchTree
+	weight float64
+}
+
 // Don't visit this subtree for collecting matches.
 type noVisitMatchTree struct {
 	matchTree
@@ -392,6 +397,10 @@ func (t *fileNameMatchTree) prepare(doc uint32) {
 	t.child.prepare(doc)
 }
 
+func (t *boostMatchTree) prepare(doc uint32) {
+	t.child.prepare(doc)
+}
+
 func (t *substrMatchTree) prepare(nextDoc uint32) {
 	t.matchIterator.prepare(nextDoc)
 	t.current = t.matchIterator.candidates()
@@ -455,6 +464,10 @@ func (t *fileNameMatchTree) nextDoc() uint32 {
 	return t.child.nextDoc()
 }
 
+func (t *boostMatchTree) nextDoc() uint32 {
+	return t.child.nextDoc()
+}
+
 func (t *branchQueryMatchTree) nextDoc() uint32 {
 	var start uint32
 	if t.firstDone {
@@ -515,6 +528,10 @@ func (t *fileNameMatchTree) String() string {
 	return fmt.Sprintf("f(%v)", t.child)
 }
 
+func (t *boostMatchTree) String() string {
+	return fmt.Sprintf("boost(%f, %v)", t.weight, t.child)
+}
+
 func (t *substrMatchTree) String() string {
 	f := ""
 	if t.fileName {
@@ -556,6 +573,8 @@ func visitMatchTree(t matchTree, f func(matchTree)) {
 		visitMatchTree(s.child, f)
 	case *fileNameMatchTree:
 		visitMatchTree(s.child, f)
+	case *boostMatchTree:
+		visitMatchTree(s.child, f)
 	case *symbolSubstrMatchTree:
 		visitMatchTree(s.substrMatchTree, f)
 	case *symbolRegexpMatchTree:
@@ -575,33 +594,41 @@ func updateMatchTreeStats(mt matchTree, stats *Stats) {
 	})
 }
 
+func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
+	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
+		f(mt)
+	})
+}
+
 // visitMatches visits all atoms which can contribute matches. Note: This
 // skips noVisitMatchTree.
-func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
+func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 	switch s := t.(type) {
 	case *andMatchTree:
 		for _, ch := range s.children {
 			if known[ch] {
-				visitMatches(ch, known, f)
+				visitMatches(ch, known, weight, f)
 			}
 		}
 	case *andLineMatchTree:
-		visitMatches(&s.andMatchTree, known, f)
+		visitMatches(&s.andMatchTree, known, weight, f)
 	case *orMatchTree:
 		for _, ch := range s.children {
 			if known[ch] {
-				visitMatches(ch, known, f)
+				visitMatches(ch, known, weight, f)
 			}
 		}
+	case *boostMatchTree:
+		visitMatches(s.child, known, weight*s.weight, f)
 	case *symbolSubstrMatchTree:
-		visitMatches(s.substrMatchTree, known, f)
+		visitMatches(s.substrMatchTree, known, weight, f)
 	case *notMatchTree:
 	case *noVisitMatchTree:
 		// don't collect into negative trees.
 	case *fileNameMatchTree:
 		// We will just gather the filename if we do not visit this tree.
 	default:
-		f(s)
+		f(s, weight)
 	}
 }
 
@@ -876,6 +903,10 @@ func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[mat
 	return evalMatchTree(cp, cost, known, t.child)
 }
 
+func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
+	return evalMatchTree(cp, cost, known, t.child)
+}
+
 func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 	if t.contEvaluated {
 		return matchesStateForSlice(t.current)
@@ -997,6 +1028,17 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
 			child: ct,
 		}, nil
 
+	case *query.Boost:
+		ct, err := d.newMatchTree(s.Child, opt)
+		if err != nil {
+			return nil, err
+		}
+
+		return &boostMatchTree{
+			child:  ct,
+			weight: s.Weight,
+		}, nil
+
 	case *query.Substring:
 		return d.newSubstringMatchTree(s)
 
@@ -1288,6 +1330,8 @@ func pruneMatchTree(mt matchTree) (matchTree, error) {
 		}
 	case *fileNameMatchTree:
 		mt.child, err = pruneMatchTree(mt.child)
+	case *boostMatchTree:
+		mt.child, err = pruneMatchTree(mt.child)
 	case *andLineMatchTree:
 		child, err := pruneMatchTree(&mt.andMatchTree)
 		if err != nil {