diff --git a/contentprovider.go b/contentprovider.go index b1fe0ab8..68c41cfe 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -169,6 +169,12 @@ func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, return result } +// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. +// +// Performance invariant: ms is sorted and non-overlapping. +// +// Note: the byte slices may be backed by mmapped data, so before being +// returned by the API it needs to be copied. func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { var result []ChunkMatch if ms[0].fileName { @@ -290,8 +296,20 @@ func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLin func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { newlines := p.newlines() - chunks := chunkCandidates(ms, newlines, numContextLines) data := p.data(false) + + // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. + // However, it depends on ms being sorted by byteOffset and non-overlapping. + // This invariant is true at the time of writing, but we conservatively + // enforce this. Note: chunkCandidates preserves the sorting so safe to + // transform now. + columnHelper := columnHelper{data: data} + if !sort.IsSorted((sortByOffsetSlice)(ms)) { + log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") + sort.Sort((sortByOffsetSlice)(ms)) + } + + chunks := chunkCandidates(ms, newlines, numContextLines) chunkMatches := make([]ChunkMatch, 0, len(chunks)) for _, chunk := range chunks { ranges := make([]Range, 0, len(chunk.candidates)) @@ -306,12 +324,12 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte Start: Location{ ByteOffset: startOffset, LineNumber: uint32(startLine), - Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), + Column: columnHelper.get(startLineOffset, startOffset), }, End: Location{ ByteOffset: endOffset, LineNumber: uint32(endLine), - Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), + Column: columnHelper.get(endLineOffset, endOffset), }, }) @@ -361,6 +379,9 @@ type candidateChunk struct { // chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent // chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause // it to overlap with an adjacent chunk. +// +// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. +// output invariants: if you flatten candidates the input invariant is retained. func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { var chunks []candidateChunk for _, m := range ms { @@ -392,6 +413,41 @@ func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines in return chunks } +// columnHelper is a helper struct which caches the number of runes last +// counted. If we naively use utf8.RuneCount for each match on a line, this +// leads to an O(nm) algorithm where m is the number of matches and n is the +// length of the line. Aassuming we our candidates are increasing in offset +// makes this operation O(n) instead. +type columnHelper struct { + data []byte + + // 0 values for all these are valid values + lastLineOffset int + lastOffset uint32 + lastRuneCount uint32 +} + +// get returns the line column for offset. offset is the byte offset of the +// rune in data. lineOffset is the byte offset inside of data for the line +// containing offset. +func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { + var runeCount uint32 + + if lineOffset == c.lastLineOffset && offset >= c.lastOffset { + // Can count from last calculation + runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) + } else { + // Need to count from the beginning of line + runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) + } + + c.lastLineOffset = lineOffset + c.lastOffset = offset + c.lastRuneCount = runeCount + + return runeCount + 1 +} + type newlines struct { // locs is the sorted set of byte offsets of the newlines in the file locs []uint32 diff --git a/contentprovider_test.go b/contentprovider_test.go index deb31b09..d811f788 100644 --- a/contentprovider_test.go +++ b/contentprovider_test.go @@ -4,6 +4,8 @@ import ( "bytes" "fmt" "testing" + "testing/quick" + "unicode/utf8" "github.com/google/go-cmp/cmp" ) @@ -327,3 +329,81 @@ func TestChunkMatches(t *testing.T) { }) } } + +func BenchmarkColumnHelper(b *testing.B) { + // We simulate looking up columns of evenly spaced matches + const matches = 10_000 + const match = "match" + const space = " " + const dist = uint32(len(match) + len(space)) + data := bytes.Repeat([]byte(match+space), matches) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + columnHelper := columnHelper{data: data} + + lineOffset := 0 + offset := uint32(0) + for offset < uint32(len(data)) { + col := columnHelper.get(lineOffset, offset) + if col != offset+1 { + b.Fatal("column is not offset even though data is ASCII") + } + offset += dist + } + } +} + +func TestColumnHelper(t *testing.T) { + f := func(line0, line1 string) bool { + data := []byte(line0 + line1) + lineOffset := len(line0) + + columnHelper := columnHelper{data: data} + + // We check every second rune returns the correct answer + offset := lineOffset + column := 1 + for offset < len(data) { + if column%2 == 0 { + got := columnHelper.get(lineOffset, uint32(offset)) + if got != uint32(column) { + return false + } + } + _, size := utf8.DecodeRune(data[offset:]) + offset += size + column++ + } + + return true + } + + if err := quick.Check(f, nil); err != nil { + t.Fatal(err) + } + + // Corner cases + + // empty data, shouldn't happen but just in case it slips through + ch := columnHelper{data: nil} + if got := ch.get(0, 0); got != 1 { + t.Fatal("empty data didn't return 1", got) + } + + // Repeating a call to get should return the same value + // empty data, shouldn't happen but just in case it slips through + ch = columnHelper{data: []byte("hello\nworld")} + if got := ch.get(6, 8); got != 3 { + t.Fatal("unexpected value for third column on second line", got) + } + if got := ch.get(6, 8); got != 3 { + t.Fatal("unexpected value for repeated call for third column on second line", got) + } + + // Now make sure if we go backwards we do not incorrectly use the cache + if got := ch.get(6, 6); got != 1 { + t.Fatal("unexpected value for backwards call for first column on second line", got) + } +} diff --git a/eval.go b/eval.go index 6d938c63..902d0618 100644 --- a/eval.go +++ b/eval.go @@ -332,6 +332,9 @@ nextFileMatch: } } + // Important invariant for performance: finalCands is sorted by offset and + // non-overlapping. gatherMatches respects this invariant and all later + // transformations respect this. shouldMergeMatches := !opts.ChunkMatches finalCands := gatherMatches(mt, known, shouldMergeMatches)