Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Commit

Permalink
Searcher: improve how matches are built (#59527)
Browse files Browse the repository at this point in the history
This PR improves how searcher creates matches, making it more consistent with
how it's done in Zoekt.

Changes:
* Pull chunking logic out of structural search code and into its own file
`chunk.go`
* Remove overlapping ranges (this is what Zoekt does when chunk matches are
enabled)
* Optimize the column calculation using the same strategy from Zoekt ([zoekt#711](sourcegraph/zoekt#711))
  • Loading branch information
jtibshirani authored Jan 12, 2024
1 parent 6dd737f commit b5937ed
Show file tree
Hide file tree
Showing 9 changed files with 611 additions and 466 deletions.
2 changes: 2 additions & 0 deletions cmd/searcher/internal/search/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "search",
srcs = [
"chunk.go",
"filter.go",
"hybrid.go",
"matchtree.go",
Expand Down Expand Up @@ -110,6 +111,7 @@ go_test(
name = "search_test",
timeout = "short",
srcs = [
"chunk_test.go",
"filter_test.go",
"github_archive_test.go",
"hybrid_test.go",
Expand Down
195 changes: 195 additions & 0 deletions cmd/searcher/internal/search/chunk.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package search

import (
"bytes"
"sort"
"unicode/utf8"

"github.com/sourcegraph/sourcegraph/cmd/searcher/protocol"
)

// chunkRanges groups a sorted set of ranges into chunks of nearby ranges.
//
// Two consecutive ranges end up in the same chunk when fewer than
// `interChunkLines` lines separate them; passing `interChunkLines == 0`
// therefore keeps ranges on adjacent lines in separate chunks.
//
// The returned chunks are ordered by line number, cover disjoint line spans,
// and are separated by more than `interChunkLines` lines. Formally, for any
// result `rangeChunks`:
// rangeChunks[i].cover.End.Line + interChunkLines < rangeChunks[i+1].cover.Start.Line
func chunkRanges(ranges []protocol.Range, interChunkLines int32) []rangeChunk {
	// Order the input by starting offset (note: this mutates the caller's slice).
	sort.Slice(ranges, func(a, b int) bool {
		return ranges[a].Start.Offset < ranges[b].Start.Offset
	})

	// Size the result assuming roughly two ranges per chunk. Since append
	// doubles capacity on growth, small inputs reallocate at most once.
	chunks := make([]rangeChunk, 0, len(ranges)/2)
	for idx, r := range ranges {
		if len(chunks) == 0 {
			// Nothing to merge with yet; start the first chunk.
			chunks = append(chunks, rangeChunk{
				cover:  r,
				ranges: ranges[:1],
			})
			continue
		}

		cur := &chunks[len(chunks)-1] // pointer so updates stick
		if r.Start.Line <= cur.cover.End.Line+interChunkLines {
			// Close enough to the current chunk: fold this range in. The
			// chunk's ranges are always a contiguous window of the sorted
			// input ending at idx.
			cur.ranges = ranges[idx-len(cur.ranges) : idx+1]

			// Grow the chunk's covering range when this range reaches further.
			if cur.cover.End.Offset < r.End.Offset {
				cur.cover.End = r.End
			}
		} else {
			// Too far away; open a fresh chunk.
			chunks = append(chunks, rangeChunk{
				cover:  r,
				ranges: ranges[idx : idx+1],
			})
		}
	}
	return chunks
}

// chunksToMatches converts each chunk into a protocol.ChunkMatch, widening
// its covering range to whole lines plus `contextLines` lines of context.
func chunksToMatches(buf []byte, chunks []rangeChunk, contextLines int32) []protocol.ChunkMatch {
	matches := make([]protocol.ChunkMatch, 0, len(chunks))
	for _, c := range chunks {
		lineRange := extendRangeToLines(c.cover, buf)
		finalRange := addContextLines(lineRange, buf, contextLines)
		content := buf[finalRange.Start.Offset:finalRange.End.Offset]
		matches = append(matches, protocol.ChunkMatch{
			// NOTE: the content is copied out of buf because buf may be
			// backed by an mmap that is unmapped before this match is
			// serialized for the network.
			Content:      string(bytes.ToValidUTF8(content, []byte("�"))),
			ContentStart: finalRange.Start,
			Ranges:       c.ranges,
		})
	}
	return matches
}

// extendRangeToLines widens inputRange so it covers whole lines: the start is
// moved back to the beginning of its line, and the end is moved forward to the
// end of its line (line terminator excluded). Line numbers are preserved; the
// start column becomes 0 and the end column is recomputed as the rune count of
// the final (partial) line.
func extendRangeToLines(inputRange protocol.Range, buf []byte) protocol.Range {
	firstLineStart := lineStart(buf, inputRange.Start.Offset)
	lastLineStart := lineStart(buf, inputRange.End.Offset)
	lastLineEnd := lineEnd(buf, inputRange.End.Offset)

	return protocol.Range{
		Start: protocol.Location{
			Offset: firstLineStart,
			Line:   inputRange.Start.Line,
			Column: 0,
		},
		End: protocol.Location{
			Offset: lastLineEnd,
			Line:   inputRange.End.Line,
			// Column is measured in runes, not bytes.
			Column: int32(utf8.RuneCount(buf[lastLineStart:lastLineEnd])),
		},
	}
}

// addContextLines widens inputRange (which must already cover whole lines) by
// up to contextLines full lines on each side, stopping early at the start or
// end of buf. Line numbers and the end column are adjusted to match.
func addContextLines(inputRange protocol.Range, buf []byte, contextLines int32) protocol.Range {
	if contextLines == 0 {
		return inputRange
	}

	start := inputRange.Start.Offset
	end := inputRange.End.Offset
	var linesBefore, linesAfter int32

	for n := int32(0); n < contextLines; n++ {
		// Step back over the preceding terminator to the previous line start.
		if start > 0 {
			start = lineStart(buf, start-1)
			linesBefore++
		}

		// Step forward past the terminator to the next line's end, but only
		// when there is actual content after it.
		tail := buf[end:]
		switch {
		case bytes.HasPrefix(tail, []byte("\r\n")) && len(tail) > 2:
			end = lineEnd(buf, end+2)
			linesAfter++
		case bytes.HasPrefix(tail, []byte("\n")) && len(tail) > 1:
			end = lineEnd(buf, end+1)
			linesAfter++
		}
	}

	finalLineStart := lineStart(buf, end)

	return protocol.Range{
		Start: protocol.Location{
			Offset: start,
			Line:   inputRange.Start.Line - linesBefore,
			Column: 0,
		},
		End: protocol.Location{
			Offset: end,
			Line:   inputRange.End.Line + linesAfter,
			// Column is measured in runes, not bytes.
			Column: int32(utf8.RuneCount(buf[finalLineStart:end])),
		},
	}
}

// lineStart returns the byte offset of the start of the line containing
// offset, i.e. one past the previous '\n', or 0 for the first line.
func lineStart(buf []byte, offset int32) int32 {
	// LastIndexByte returns -1 when there is no preceding newline, so the
	// +1 below yields 0 (start of buffer) in that case.
	idx := bytes.LastIndexByte(buf[:offset], '\n')
	return int32(idx) + 1
}

// lineEnd returns the byte offset just past the content of the line
// containing offset, excluding the line terminator ('\n' or "\r\n"). When
// there is no terminator, it returns len(buf).
func lineEnd(buf []byte, offset int32) int32 {
	loc := bytes.IndexByte(buf[offset:], '\n')
	if loc < 0 {
		return int32(len(buf))
	}
	end := offset + int32(loc)
	// Drop the '\r' of a CRLF terminator too.
	if end > 0 && buf[end-1] == '\r' {
		end--
	}
	return end
}

// columnHelper computes rune-based column numbers for byte offsets, caching
// the result of the previous call. Counting runes from the line start on
// every match would be O(nm) for m matches on a line of length n; because
// matches arrive sorted by offset, resuming from the previous position makes
// the whole line O(n).
type columnHelper struct {
	data []byte

	// The zero value of each field below is a valid starting state.
	lastLineOffset int
	lastOffset     int
	lastRuneCount  int
}

// get returns the column for a match. lineOffset is the byte offset of the
// start of the match's line within data; offset is the byte offset of the
// match itself.
func (c *columnHelper) get(lineOffset int, offset int) int {
	sameLine := lineOffset == c.lastLineOffset
	if sameLine && offset >= c.lastOffset {
		// Resume counting from where the previous call stopped.
		c.lastRuneCount += utf8.RuneCount(c.data[c.lastOffset:offset])
	} else {
		// New line (or a backwards offset): count from the line start.
		c.lastRuneCount = utf8.RuneCount(c.data[lineOffset:offset])
	}

	c.lastLineOffset = lineOffset
	c.lastOffset = offset
	return c.lastRuneCount
}
Loading

0 comments on commit b5937ed

Please sign in to comment.