Skip to content

Commit

Permalink
WIP boost scores based on atoms
Browse files Browse the repository at this point in the history
Early preview of this work. I am playing around with factors/etc. Also
only works for ChunkMatches at the moment.

It makes some results worse, but it improves every one of the atom boost
queries we added.
  • Loading branch information
keegancsmith committed Jan 12, 2024
1 parent b35d8a2 commit d32a823
Show file tree
Hide file tree
Showing 16 changed files with 221 additions and 159 deletions.
2 changes: 1 addition & 1 deletion api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ func TestMatchSize(t *testing.T) {
size: 112,
}, {
v: candidateMatch{},
size: 72,
size: 80,
}, {
v: candidateChunk{},
size: 40,
Expand Down
51 changes: 48 additions & 3 deletions contentprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"bytes"
"fmt"
"log"
"math/bits"
"os"
"path"
"sort"
Expand Down Expand Up @@ -205,12 +206,14 @@ func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines
FileName: true,
}}
} else {
result = p.fillContentChunkMatches(ms, numContextLines)
result = p.fillContentChunkMatches(ms, numContextLines, debug)
}

sects := p.docSections()
for i, m := range result {
result[i].Score, result[i].DebugScore = p.chunkMatchScore(sects, &m, language, debug)
score, debugScore := p.chunkMatchScore(sects, &m, language, debug)
result[i].Score += score
result[i].DebugScore += debugScore
}

return result
Expand Down Expand Up @@ -294,13 +297,48 @@ func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLin
return result
}

func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch {
// scoreQueryAtoms computes a score boost for a group of candidate matches
// based on the query atoms recorded on them.
//
// Two components are summed:
//   - the number of distinct atom bits set across all of ms, multiplied by
//     scoreQueryAtomsCountFactor;
//   - the longest run observed while walking ms in order, multiplied by
//     scoreQueryAtomsRunFactor. The run counter grows whenever a candidate's
//     queryAtoms value is strictly greater than the previous candidate's,
//     restarts at 1 when it is strictly smaller, and is left alone when the
//     two values are equal.
//
// When debug is true, what describes each non-zero component; otherwise it
// is empty.
func scoreQueryAtoms(ms []*candidateMatch, debug bool) (score float64, what string) {
	var (
		seen    uint64 // union of every candidate's atom bits
		prev    uint64 // queryAtoms of the previously visited candidate
		streak  int    // length of the run currently being tracked
		longest int    // longest run seen so far
	)

	for i := range ms {
		atoms := ms[i].queryAtoms
		seen |= atoms

		// TODO handle multiple bits set in queryAtoms. only happens for linematch
		switch {
		case atoms > prev:
			streak++
			if longest < streak {
				longest = streak
			}
		case atoms < prev:
			streak = 1
		}
		prev = atoms
	}

	countScore := float64(bits.OnesCount64(seen)) * scoreQueryAtomsCountFactor
	runScore := float64(longest) * scoreQueryAtomsRunFactor

	if debug {
		if countScore > 0 {
			what += fmt.Sprintf("%s:%.2f, ", "queryAtomsCount", countScore)
		}
		if runScore > 0 {
			what += fmt.Sprintf("%s:%.2f, ", "queryAtomsRun", runScore)
		}
	}

	return countScore + runScore, what
}

func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int, debug bool) []ChunkMatch {
newlines := p.newlines()
data := p.data(false)

// columnHelper prevents O(len(ms) * len(data)) lookups for all columns.
// However, it depends on ms being sorted by byteOffset and non-overlapping.
// This invariant is true at the time of writing, but we conservatively
// enforce this. Note: chunkCandidates preserves the sorting so safe to
// transform now.
columnHelper := columnHelper{data: data}
Expand Down Expand Up @@ -353,6 +391,8 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
}
firstLineStart, _ := newlines.lineBounds(firstLineNumber)

initialScore, initialWhat := scoreQueryAtoms(chunk.candidates, debug)

chunkMatches = append(chunkMatches, ChunkMatch{
Content: newlines.getLines(data, firstLineNumber, int(chunk.lastLine)+numContextLines+1),
ContentStart: Location{
Expand All @@ -363,6 +403,8 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
FileName: false,
Ranges: ranges,
SymbolInfo: symbolInfo,
Score: initialScore,
DebugScore: initialWhat,
})
}
return chunkMatches
Expand Down Expand Up @@ -545,6 +587,9 @@ const (

// Used for ordering line and chunk matches within a file.
scoreLineOrderFactor = 1.0

scoreQueryAtomsCountFactor = 500.0
scoreQueryAtomsRunFactor = 750.0
)

// findSection checks whether a section defined by offset and size lies within
Expand Down
23 changes: 19 additions & 4 deletions eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,19 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
return m[i].byteOffset < m[j].byteOffset
}

// setQueryAtom stamps every candidate in cands with the atom bit currently
// held in *queryAtom, then advances *queryAtom to the next bit by shifting it
// left by one. cands is returned so the call can be used inline in an append.
//
// If *queryAtom is already zero, nothing is modified. The Go spec says high
// bits are discarded on <<, so the value becomes zero once 64 atoms have been
// handed out.
func setQueryAtom(cands []*candidateMatch, queryAtom *uint64) []*candidateMatch {
	bit := *queryAtom
	if bit != 0 {
		for i := range cands {
			cands[i].queryAtoms = bit
		}
		*queryAtom = bit << 1
	}
	return cands
}

// Gather matches from this document. This never returns a mixture of
// filename/content matches: if there are content matches, all
// filename matches are trimmed from the result. The matches are
Expand All @@ -550,18 +563,19 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
// but adjacent matches will remain.
func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
var cands []*candidateMatch
queryAtom := uint64(1)
visitMatches(mt, known, func(mt matchTree) {
if smt, ok := mt.(*substrMatchTree); ok {
cands = append(cands, smt.current...)
cands = append(cands, setQueryAtom(smt.current, &queryAtom)...)
}
if rmt, ok := mt.(*regexpMatchTree); ok {
cands = append(cands, rmt.found...)
cands = append(cands, setQueryAtom(rmt.found, &queryAtom)...)
}
if rmt, ok := mt.(*wordMatchTree); ok {
cands = append(cands, rmt.found...)
cands = append(cands, setQueryAtom(rmt.found, &queryAtom)...)
}
if smt, ok := mt.(*symbolRegexpMatchTree); ok {
cands = append(cands, smt.found...)
cands = append(cands, setQueryAtom(smt.found, &queryAtom)...)
}
})

Expand Down Expand Up @@ -597,6 +611,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
if lastEnd >= c.byteOffset {
if end > lastEnd {
last.byteMatchSz = end - last.byteOffset
last.queryAtoms = last.queryAtoms | c.queryAtoms
}
continue
}
Expand Down
20 changes: 10 additions & 10 deletions internal/e2e/testdata/Repository_metadata_Write_rbac.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@ queryString: Repository metadata Write rbac
query: (and case_substr:"Repository" substr:"metadata" case_substr:"Write" substr:"rbac")
targetRank: -1

github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/repository_metadata_test.go
26:func TestRepositoryMetadata(t *testing.T) {
241: require.Equal(t, err, &rbac.ErrNotAuthorized{Permission: string(rbac.RepoMetadataWritePermission)})
254: require.Equal(t, err, &rbac.ErrNotAuthorized{Permission: string(rbac.RepoMetadataWritePermission)})
hidden 25 more line matches

github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/repository_metadata.go
54:func (r *schemaResolver) AddRepoMetadata(ctx context.Context, args struct {
95:func (r *schemaResolver) UpdateRepoMetadata(ctx context.Context, args struct {
Expand All @@ -10,8 +16,8 @@ hidden 30 more line matches

github.com/sourcegraph/sourcegraph/client/web/src/repo/tree/TreePageContent.tsx
666:interface RepositoryContributorNodeProps extends QuerySpec {
10:import { RepoMetadata } from '@sourcegraph/branded'
16:import { RepositoryType, SearchPatternType, type TreeFields } from '@sourcegraph/shared/src/graphql-operations'
53:import { canWriteRepoMetadata } from '../../util/rbac'
213: const [enableRepositoryMetadata] = useFeatureFlag('repository-metadata', true)
hidden 46 more line matches

github.com/sourcegraph/sourcegraph/doc/admin/repo/metadata.md
Expand All @@ -20,15 +26,9 @@ github.com/sourcegraph/sourcegraph/doc/admin/repo/metadata.md
8:### Repository owners
hidden 14 more line matches

github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/repository_metadata_test.go
26:func TestRepositoryMetadata(t *testing.T) {
17: "github.com/sourcegraph/sourcegraph/internal/rbac"
23: rtypes "github.com/sourcegraph/sourcegraph/internal/rbac/types"
hidden 25 more line matches

github.com/sourcegraph/sourcegraph/client/web/src/repo/repoContainerRoutes.tsx
3:import { canWriteRepoMetadata } from '../util/rbac'
5:import { RepositoryChangelistPage } from './commit/RepositoryCommitPage'
9:const RepositoryCommitPage = lazyComponent(() => import('./commit/RepositoryCommitPage'), 'RepositoryCommitPage')
24:const RepositoryMetadataPage = lazyComponent(() => import('./RepoMetadataPage'), 'RepoMetadataPage')
70: render: context => <RepositoryMetadataPage {...context} />,
hidden 19 more line matches

Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ targetRank: 1

**github.com/sourcegraph/sourcegraph/ui/assets/assets.go**
33:func (p FailingAssetsProvider) Assets() http.FileSystem {
14: Assets() http.FileSystem
1:package assets
30: return nil, errors.New("assets are not configured for this binary, please see ui/assets")
34: panic("assets are not configured for this binary, please see ui/assets")
hidden 12 more line matches

github.com/sourcegraph/sourcegraph/schema/schema.go
Expand All @@ -23,7 +23,7 @@ hidden 47 more line matches
github.com/sourcegraph/sourcegraph/doc/getting-started/github-vs-sourcegraph.md
8:## Which is best for you?
110:### Searching repositories, branches, and forks
18:As your codebase grows in complexity, the value of code search quickly increases. Sourcegraph may be a good fit for your team if:
123:**Forks** are included in the index, but they are subject to the same limitations as other repositories, so not all forks are indexed.
hidden 66 more line matches

github.com/sourcegraph/sourcegraph/doc/admin/executors/deploy_executors_terraform.md
Expand All @@ -35,7 +35,7 @@ hidden 68 more line matches
github.com/sourcegraph/sourcegraph/doc/dev/background-information/sg/reference.md
496:### sg lint format
505:### sg lint format
1:<!-- DO NOT EDIT: generated via: go generate ./dev/sg -->
726:* `--skip-upgrade-validation`: Do not attempt to compare the previous instance version with the target instance version for upgrade compatibility. Please refer to https://docs.sourcegraph.com/admin/updates#update-policy for our instance upgrade compatibility policy.
hidden 265 more line matches

hidden 3 more file matches
10 changes: 4 additions & 6 deletions internal/e2e/testdata/bufio_buffer.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,14 @@ github.com/golang/go/src/cmd/doc/pkg.go
8: "bufio"
hidden 8 more line matches

github.com/golang/go/test/fixedbugs/issue5089.go
13:func (b *bufio.Reader) Buffered() int { // ERROR "non-local|redefinition"
11:import "bufio"

github.com/golang/go/src/net/http/h2_bundle.go
3716:type http2pipeBuffer interface {
1086:type http2dataBuffer struct {
3724:func (p *http2pipe) setBuffer(b http2pipeBuffer) {
hidden 116 more line matches

github.com/golang/go/src/image/png/writer.go
36:type EncoderBuffer encoder
24: BufferPool EncoderBufferPool
30:type EncoderBufferPool interface {
hidden 18 more line matches

hidden 112 more file matches
56 changes: 28 additions & 28 deletions internal/e2e/testdata/bufio_flush_writer.txt
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
queryString: bufio flush writer
query: (and substr:"bufio" substr:"flush" substr:"writer")
targetRank: 25
targetRank: 1

github.com/golang/go/src/image/gif/writer.go
43:type writer interface {
77:func (b blockWriter) Flush() error {
123:func (e *encoder) flush() {
hidden 28 more line matches
**github.com/golang/go/src/net/http/transfer.go**
1113:type bufioFlushWriter struct{ w io.Writer }
59:type transferWriter struct {
76:func newTransferWriter(r any) (t *transferWriter, err error) {
hidden 36 more line matches

github.com/golang/go/src/image/jpeg/writer.go
211:type writer interface {
231:func (e *encoder) flush() {
212: Flush() error
hidden 11 more line matches
github.com/golang/go/src/net/http/fcgi/fcgi.go
233: *bufio.Writer
231:type bufWriter struct {
252:type streamWriter struct {
hidden 17 more line matches

github.com/golang/go/src/compress/lzw/writer.go
15:type writer interface {
36:type Writer struct {
17: Flush() error
hidden 36 more line matches
github.com/golang/go/src/cmd/internal/bio/buf.go
24: *bufio.Writer
22:type Writer struct {
34: return &Writer{f: f, Writer: bufio.NewWriter(f)}, nil
hidden 13 more line matches

github.com/golang/go/src/net/http/internal/chunked.go
240: *bufio.Writer
239:type FlushAfterChunkWriter struct {
196:type chunkedWriter struct {
hidden 25 more line matches

github.com/golang/go/src/bufio/bufio.go
579:type Writer struct {
635:func (b *Writer) Flush() error {
579:type Writer struct {
836: *Writer
hidden 72 more line matches

github.com/golang/go/src/archive/zip/writer.go
24:type Writer struct {
61:func (w *Writer) Flush() error {
607: io.Writer
hidden 55 more line matches

github.com/golang/go/src/encoding/csv/writer.go
30:type Writer struct {
123:func (w *Writer) Flush() {
37:func NewWriter(w io.Writer) *Writer {
hidden 25 more line matches
github.com/golang/go/src/image/gif/writer.go
77:func (b blockWriter) Flush() error {
43:type writer interface {
123:func (e *encoder) flush() {
hidden 28 more line matches

hidden 77 more file matches
62 changes: 31 additions & 31 deletions internal/e2e/testdata/bytes_buffer.txt
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
queryString: bytes buffer
query: (and substr:"bytes" substr:"buffer")
targetRank: 1
targetRank: 5

**github.com/golang/go/src/bytes/buffer.go**
20:type Buffer struct {
54:func (b *Buffer) Bytes() []byte { return b.buf[b.off:] }
5:package bytes
hidden 126 more line matches
github.com/sourcegraph/sourcegraph/lib/output/block.go
49: buffer bytes.Buffer
41: for _, line := range bytes.Split(bytes.TrimRight(b.writer.buffer.Bytes(), "\n"), []byte("\n")) {
4: "bytes"
hidden 2 more line matches

github.com/golang/go/src/cmd/internal/edit/edit.go
14:type Buffer struct {
68:func (b *Buffer) Bytes() []byte {
41:func NewBuffer(data []byte) *Buffer {
hidden 13 more line matches
github.com/golang/go/src/compress/flate/huffman_bit_writer.go
82: bytes [bufferSize]byte
166:func (w *huffmanBitWriter) writeBytes(bytes []byte) {
29: bufferFlushSize = 240
hidden 63 more line matches

github.com/golang/go/src/hash/crc32/crc32_ppc64le.s
122: SLD $2,R8 // convert index-> bytes
59: MOVWZ 0(R5),R8 // 0-3 bytes of p ?Endian?
60: MOVWZ 4(R5),R9 // 4-7 bytes of p
hidden 35 more line matches
github.com/sourcegraph/sourcegraph/client/browser/src/types/webextension-polyfill/index.d.ts
1708: bytes?: ArrayBuffer
501: bytesReceived: number
502: totalBytes: number
hidden 9 more line matches

github.com/golang/go/src/fmt/print.go
101:type buffer []byte
509:func (p *pp) fmtBytes(v []byte, verb rune, typeString string) {
17:// Strings for use with buffer.WriteString.
hidden 28 more line matches
github.com/golang/go/src/encoding/json/decode_test.go
1784: Buffer bytes.Buffer
1777: PBuffer *bytes.Buffer // has methods, just not relevant ones
1379: ByteSlice []byte
hidden 17 more line matches

github.com/golang/go/src/bufio/scan.go
106:func (s *Scanner) Bytes() []byte {
267:func (s *Scanner) Buffer(buf []byte, max int) {
289:func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
hidden 26 more line matches
**github.com/golang/go/src/bytes/buffer.go**
54:func (b *Buffer) Bytes() []byte { return b.buf[b.off:] }
20:type Buffer struct {
5:package bytes
hidden 126 more line matches

github.com/golang/go/src/os/exec/exec.go
1134:func (w *prefixSuffixSaver) Bytes() []byte {
94: "bytes"
396: if i := bytes.Index(stack, []byte("\nos/exec.Command(")); i >= 0 {
hidden 17 more line matches
github.com/golang/go/src/cmd/doc/pkg.go
59: bytes.Buffer
56:type pkgBuffer struct {
233:var newlineBytes = []byte("\n\n") // We never ask for more than 2.
hidden 11 more line matches

hidden 494 more file matches
Loading

0 comments on commit d32a823

Please sign in to comment.