diff --git a/api_test.go b/api_test.go index ab13f145d..ca6b47d56 100644 --- a/api_test.go +++ b/api_test.go @@ -152,7 +152,7 @@ func TestMatchSize(t *testing.T) { size: 112, }, { v: candidateMatch{}, - size: 72, + size: 80, }, { v: candidateChunk{}, size: 40, diff --git a/contentprovider.go b/contentprovider.go index 15156cab0..fca195fec 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -660,6 +660,13 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str } } + if m.scoreWeight != 1 { // should we be using epsilon comparison here? + score.score = score.score * m.scoreWeight + if debug { + score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight) + } + } + if score.score > maxScore.score { maxScore.score = score.score maxScore.what = score.what diff --git a/eval.go b/eval.go index 7808f733b..40cf1a0cf 100644 --- a/eval.go +++ b/eval.go @@ -420,7 +420,7 @@ nextFileMatch: // whether there's an exact match on a symbol, the number of query clauses that matched, etc. func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) { atomMatchCount := 0 - visitMatches(mt, known, func(mt matchTree) { + visitMatchAtoms(mt, known, func(mt matchTree) { atomMatchCount++ }) @@ -544,6 +544,13 @@ func (m sortByOffsetSlice) Less(i, j int) bool { return m[i].byteOffset < m[j].byteOffset } +func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch { + for _, m := range cm { + m.scoreWeight = scoreWeight + } + return cm +} + // Gather matches from this document. This never returns a mixture of // filename/content matches: if there are content matches, all // filename matches are trimmed from the result. The matches are @@ -554,18 +561,20 @@ func (m sortByOffsetSlice) Less(i, j int) bool { // but adjacent matches will remain. func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch { var cands []*candidateMatch - visitMatches(mt, known, func(mt matchTree) { + visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) { + // TODO apply scoreWeight to candidates + _ = scoreWeight if smt, ok := mt.(*substrMatchTree); ok { - cands = append(cands, smt.current...) + cands = append(cands, setScoreWeight(scoreWeight, smt.current)...) } if rmt, ok := mt.(*regexpMatchTree); ok { - cands = append(cands, rmt.found...) + cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) } if rmt, ok := mt.(*wordMatchTree); ok { - cands = append(cands, rmt.found...) + cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...) } if smt, ok := mt.(*symbolRegexpMatchTree); ok { - cands = append(cands, smt.found...) + cands = append(cands, setScoreWeight(scoreWeight, smt.found)...) } }) @@ -590,6 +599,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid // are non-overlapping. sort.Sort((sortByOffsetSlice)(cands)) res = cands[:0] + mergeRun := 1 for i, c := range cands { if i == 0 { res = append(res, c) @@ -599,10 +609,23 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid lastEnd := last.byteOffset + last.byteMatchSz end := c.byteOffset + c.byteMatchSz if lastEnd >= c.byteOffset { + mergeRun++ + + // Average out the score across the merged candidates. Only do it if + // we are boosting to avoid floating point funkiness in the normal + // case. + if last.scoreWeight != 1 && c.scoreWeight != 1 { + last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun) + } + + // latest candidate goes further, update our end if end > lastEnd { last.byteMatchSz = end - last.byteOffset } + continue + } else { + mergeRun = 1 } res = append(res, c) @@ -649,7 +672,7 @@ func (d *indexData) branchIndex(docID uint32) int { // returns all branches containing docID. func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string { var mask uint64 - visitMatches(mt, known, func(mt matchTree) { + visitMatchAtoms(mt, known, func(mt matchTree) { bq, ok := mt.(*branchQueryMatchTree) if !ok { return diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go index e65cc0125..47e6666d7 100644 --- a/internal/e2e/e2e_rank_test.go +++ b/internal/e2e/e2e_rank_test.go @@ -118,6 +118,10 @@ func TestRanking(t *testing.T) { t.Fatal(err) } + // q is marshalled as part of the test, so avoid our rewrites for + // ranking. + qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{}) + sOpts := zoekt.SearchOptions{ // Use the same options sourcegraph has by default ChunkMatches: true, @@ -128,7 +132,7 @@ func TestRanking(t *testing.T) { DebugScore: *debugScore, } - result, err := ss.Search(context.Background(), q, &sOpts) + result, err := ss.Search(context.Background(), qSearch, &sOpts) if err != nil { t.Fatal(err) } diff --git a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt index a501118e4..a0e84ef88 100644 --- a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt +++ b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt @@ -3,9 +3,9 @@ query: (and substr:"assets" substr:"are" substr:"not" substr:"configured" substr targetRank: 1 **github.com/sourcegraph/sourcegraph/ui/assets/assets.go** +30: return nil, errors.New("assets are not configured for this binary, please see ui/assets") +34: panic("assets are not configured for this binary, please see ui/assets") 33:func (p FailingAssetsProvider) Assets() http.FileSystem { -14: Assets() http.FileSystem -1:package assets hidden 12 more line matches github.com/sourcegraph/sourcegraph/schema/schema.go diff --git a/internal/e2e/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt index 57dad53e4..b545b3451 100644 --- a/internal/e2e/testdata/generate_unit_test.txt +++ b/internal/e2e/testdata/generate_unit_test.txt @@ -1,6 +1,30 @@ queryString: generate unit test query: (and substr:"generate" substr:"unit" substr:"test") -targetRank: 11 +targetRank: 1 + +**github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts** +16: public title = 'Generate Unit Test' +14:export class GenerateTest implements Recipe { +15: public id: RecipeID = 'generate-unit-test' +hidden 3 more line matches + +github.com/sourcegraph/sourcegraph/client/jetbrains/README.md +40:- Generate unit test +41:- Generate docstring +61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code. +hidden 7 more line matches + +github.com/sourcegraph/cody/vscode/CHANGELOG.md +298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602) +218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907) +264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907) +hidden 17 more line matches + +github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md +158:- Generate unit test +138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://.sourcegraph.com/users//settings/tokens`). From here, generate a new access token. +159:- Generate docstring +hidden 3 more line matches github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go 300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) { @@ -14,28 +38,4 @@ github.com/golang/go/src/cmd/vendor/github.com/google/pprof/internal/report/repo 75: SampleUnit string // Unit for the sample data from the profile. hidden 48 more line matches -github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua -9: generate = function(_, paths) -6: patterns = { pattern.new_path_basename "sg-test" }, -8: -- Invoked as part of unit tests for the autoindexing service -hidden 1 more line matches - -github.com/golang/go/src/cmd/internal/testdir/testdir_test.go -273:type test struct { -74:func Test(t *testing.T) { -263:type testCommon struct { -hidden 120 more line matches - -github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go -65: Unit string // seconds, nanoseconds, bytes, etc -77: NumUnit map[string][]string -68: unitX int64 -hidden 44 more line matches - -github.com/golang/go/src/cmd/link/internal/loader/loader.go -79: unit *sym.CompilationUnit -1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit { -228: generatedSyms Bitmap // symbols that generate their content, indexed by ext sym idx -hidden 50 more line matches - hidden 245 more file matches diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt index 5de9de054..81969e3f6 100644 --- a/internal/e2e/testdata/rank_stats.txt +++ b/internal/e2e/testdata/rank_stats.txt @@ -1,4 +1,4 @@ queries: 14 -recall@1: 7 (50%) -recall@5: 9 (64%) -mrr: 0.579471 +recall@1: 9 (64%) +recall@5: 11 (79%) +mrr: 0.710733 diff --git a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt index b1c431e98..cf80e597e 100644 --- a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt +++ b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt @@ -1,6 +1,11 @@ queryString: sourcegraph/server docker image build query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build") -targetRank: 14 +targetRank: 1 + +**github.com/sourcegraph/sourcegraph/dev/tools.go** +7: // zoekt-* used in sourcegraph/server docker image build +1://go:build tools +2:// +build tools github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go 458: Build int @@ -32,10 +37,4 @@ github.com/sourcegraph/sourcegraph/internal/updatecheck/handler.go 50: latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8") hidden 19 more line matches -github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md -1:# Docker Single Container Deployment -294:### Insiders build -238:### File system performance on Docker for Mac -hidden 52 more line matches - hidden 15 more file matches diff --git a/matchiter.go b/matchiter.go index 68c6e4856..98bf6b1ca 100644 --- a/matchiter.go +++ b/matchiter.go @@ -27,6 +27,8 @@ type candidateMatch struct { substrBytes []byte substrLowered []byte + scoreWeight float64 + file uint32 symbolIdx uint32 diff --git a/matchtree.go b/matchtree.go index 102e0bc26..e4fd17a9d 100644 --- a/matchtree.go +++ b/matchtree.go @@ -170,6 +170,11 @@ type fileNameMatchTree struct { child matchTree } +type boostMatchTree struct { + child matchTree + weight float64 +} + // Don't visit this subtree for collecting matches. type noVisitMatchTree struct { matchTree @@ -392,6 +397,10 @@ func (t *fileNameMatchTree) prepare(doc uint32) { t.child.prepare(doc) } +func (t *boostMatchTree) prepare(doc uint32) { + t.child.prepare(doc) +} + func (t *substrMatchTree) prepare(nextDoc uint32) { t.matchIterator.prepare(nextDoc) t.current = t.matchIterator.candidates() @@ -455,6 +464,10 @@ func (t *fileNameMatchTree) nextDoc() uint32 { return t.child.nextDoc() } +func (t *boostMatchTree) nextDoc() uint32 { + return t.child.nextDoc() +} + func (t *branchQueryMatchTree) nextDoc() uint32 { var start uint32 if t.firstDone { @@ -515,6 +528,10 @@ func (t *fileNameMatchTree) String() string { return fmt.Sprintf("f(%v)", t.child) } +func (t *boostMatchTree) String() string { + return fmt.Sprintf("boost(%f, %v)", t.weight, t.child) +} + func (t *substrMatchTree) String() string { f := "" if t.fileName { @@ -556,6 +573,8 @@ func visitMatchTree(t matchTree, f func(matchTree)) { visitMatchTree(s.child, f) case *fileNameMatchTree: visitMatchTree(s.child, f) + case *boostMatchTree: + visitMatchTree(s.child, f) case *symbolSubstrMatchTree: visitMatchTree(s.substrMatchTree, f) case *symbolRegexpMatchTree: @@ -575,33 +594,41 @@ func updateMatchTreeStats(mt matchTree, stats *Stats) { }) } +func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) { + visitMatches(t, known, 1, func(mt matchTree, _ float64) { + f(mt) + }) +} + // visitMatches visits all atoms which can contribute matches. Note: This // skips noVisitMatchTree. -func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) { +func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) { switch s := t.(type) { case *andMatchTree: for _, ch := range s.children { if known[ch] { - visitMatches(ch, known, f) + visitMatches(ch, known, weight, f) } } case *andLineMatchTree: - visitMatches(&s.andMatchTree, known, f) + visitMatches(&s.andMatchTree, known, weight, f) case *orMatchTree: for _, ch := range s.children { if known[ch] { - visitMatches(ch, known, f) + visitMatches(ch, known, weight, f) } } + case *boostMatchTree: + visitMatches(s.child, known, weight*s.weight, f) case *symbolSubstrMatchTree: - visitMatches(s.substrMatchTree, known, f) + visitMatches(s.substrMatchTree, known, weight, f) case *notMatchTree: case *noVisitMatchTree: // don't collect into negative trees. case *fileNameMatchTree: // We will just gather the filename if we do not visit this tree. default: - f(s) + f(s, weight) } } @@ -876,6 +903,10 @@ func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[mat return evalMatchTree(cp, cost, known, t.child) } +func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { + return evalMatchTree(cp, cost, known, t.child) +} + func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { if t.contEvaluated { return matchesStateForSlice(t.current) @@ -997,6 +1028,17 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) child: ct, }, nil + case *query.Boost: + ct, err := d.newMatchTree(s.Child, opt) + if err != nil { + return nil, err + } + + return &boostMatchTree{ + child: ct, + weight: s.Weight, + }, nil + case *query.Substring: return d.newSubstringMatchTree(s) @@ -1288,6 +1330,8 @@ func pruneMatchTree(mt matchTree) (matchTree, error) { } case *fileNameMatchTree: mt.child, err = pruneMatchTree(mt.child) + case *boostMatchTree: + mt.child, err = pruneMatchTree(mt.child) case *andLineMatchTree: child, err := pruneMatchTree(&mt.andMatchTree) if err != nil { diff --git a/query/boost.go b/query/boost.go new file mode 100644 index 000000000..3b3cd8a31 --- /dev/null +++ b/query/boost.go @@ -0,0 +1,61 @@ +package query + +type ExperimentalPhraseBoostOptions struct { + // The phrase needs to contain atleast this many terms. This is based on the + // parsed query. + // + // Defaults to 3. + MinTerms int + + // BoostWeight is how much to multiply the phrase match scores by. + // + // Defaults to 20. + BoostWeight float64 +} + +// ExperimentalPhraseBoost transforms q into a query containing exact matches +// to phrase boosted. opts control how and when the boosting is done. +// +// Note: This is a temporary API and will be removed in future commits. +func ExpirementalPhraseBoost(q Q, phrase string, opts ExperimentalPhraseBoostOptions) Q { + if opts.MinTerms == 0 { + opts.MinTerms = 3 + } + if opts.BoostWeight == 0 { + opts.BoostWeight = 20 + } + + contentAtoms := 0 + caseSensitive := false + VisitAtoms(q, func(q Q) { + switch s := q.(type) { + case *Regexp: + // Check atom is for content + if s.Content || !(s.Content && s.FileName) { + caseSensitive = s.CaseSensitive + contentAtoms++ + } + case *Substring: + if s.Content || !(s.Content && s.FileName) { + caseSensitive = s.CaseSensitive + contentAtoms++ + } + } + }) + + if contentAtoms < opts.MinTerms { + return q + } + + return NewOr( + &Boost{ + Weight: opts.BoostWeight, + Child: &Substring{ + Pattern: phrase, + Content: true, + CaseSensitive: caseSensitive, + }, + }, + q, + ) +} diff --git a/query/query.go b/query/query.go index 6a8c5dd3a..ce17e301a 100644 --- a/query/query.go +++ b/query/query.go @@ -386,6 +386,19 @@ func (q *Type) String() string { } } +// Boost scales the contribution to score of descendents. +type Boost struct { + Child Q + // Weight will multiply the score of its descendents. Weights less than 1 + // will give less importance while values greater than 1 will give more + // importance. + Weight float64 +} + +func (q *Boost) String() string { + return fmt.Sprintf("(boost %f %s)", q.Weight, q.Child) +} + // Substring is the most basic query: a query for a substring. type Substring struct { Pattern string @@ -609,6 +622,9 @@ func flatten(q Q) (Q, bool) { case *Type: child, changed := flatten(s.Child) return &Type{Child: child, Type: s.Type}, changed + case *Boost: + child, changed := flatten(s.Child) + return &Boost{Child: child, Weight: s.Weight}, changed default: return q, false } @@ -680,6 +696,8 @@ func evalConstants(q Q) Q { return ch } return &Type{Child: ch, Type: s.Type} + case *Boost: + return &Boost{Weight: s.Weight, Child: evalConstants(s.Child)} case *Substring: if len(s.Pattern) == 0 { return &Const{true} @@ -728,6 +746,8 @@ func Map(q Q, f func(q Q) Q) Q { q = &Not{Child: Map(s.Child, f)} case *Type: q = &Type{Type: s.Type, Child: Map(s.Child, f)} + case *Boost: + q = &Boost{Weight: s.Weight, Child: Map(s.Child, f)} } return f(q) } @@ -768,6 +788,7 @@ func VisitAtoms(q Q, v func(q Q)) { case *Or: case *Not: case *Type: + case *Boost: default: v(iQ) } diff --git a/web/server.go b/web/server.go index 5fc7e0fb7..067acb918 100644 --- a/web/server.go +++ b/web/server.go @@ -241,6 +241,13 @@ func (s *Server) serveSearchErr(r *http.Request) (*ApiSearchResult, error) { return nil, err } + // Experimental: The query string and boost exact phrases of it. + if phraseBoost, err := strconv.ParseFloat(qvals.Get("phrase-boost"), 64); err == nil { + q = query.ExpirementalPhraseBoost(q, queryStr, query.ExperimentalPhraseBoostOptions{ + BoostWeight: phraseBoost, + }) + } + repoOnly := true query.VisitAtoms(q, func(q query.Q) { _, ok := q.(*query.Repo)