From cfb7fe3f4d596c3da10e2992dea2d495143e3dfb Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 9 Jan 2024 12:27:38 +0200 Subject: [PATCH 1/3] factor out column calculation in chunk matches into helper This doesn't change the logic, it just moves it into a struct so I can make it smarter. Additionally we add a benchmark and test in this commit. The next commit will contain the perf improvement. --- contentprovider.go | 21 +++++++++-- contentprovider_test.go | 80 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 2 deletions(-) diff --git a/contentprovider.go b/contentprovider.go index b1fe0ab8..dd7eb8fd 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -293,6 +293,7 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte chunks := chunkCandidates(ms, newlines, numContextLines) data := p.data(false) chunkMatches := make([]ChunkMatch, 0, len(chunks)) + columnHelper := columnHelper{data: data} for _, chunk := range chunks { ranges := make([]Range, 0, len(chunk.candidates)) var symbolInfo []*Symbol @@ -306,12 +307,12 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte Start: Location{ ByteOffset: startOffset, LineNumber: uint32(startLine), - Column: uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1), + Column: columnHelper.get(startLineOffset, startOffset), }, End: Location{ ByteOffset: endOffset, LineNumber: uint32(endLine), - Column: uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1), + Column: columnHelper.get(endLineOffset, endOffset), }, }) @@ -392,6 +393,22 @@ func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines in return chunks } +// columnHelper is a helper struct which caches the number of runes last +// counted. If we naively use utf8.RuneCount for each match on a line, this +// leads to an O(nm) algorithm where m is the number of matches and n is the +// length of the line. Aassuming we our candidates are increasing in offset +// makes this operation O(n) instead. +type columnHelper struct { + data []byte +} + +// get returns the line column for offset. offset is the byte offset of the +// rune in data. lineOffset is the byte offset inside of data for the line +// containing offset. +func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { + return uint32(utf8.RuneCount(c.data[lineOffset:offset]) + 1) +} + type newlines struct { // locs is the sorted set of byte offsets of the newlines in the file locs []uint32 diff --git a/contentprovider_test.go b/contentprovider_test.go index deb31b09..d811f788 100644 --- a/contentprovider_test.go +++ b/contentprovider_test.go @@ -4,6 +4,8 @@ import ( "bytes" "fmt" "testing" + "testing/quick" + "unicode/utf8" "github.com/google/go-cmp/cmp" ) @@ -327,3 +329,81 @@ func TestChunkMatches(t *testing.T) { }) } } + +func BenchmarkColumnHelper(b *testing.B) { + // We simulate looking up columns of evenly spaced matches + const matches = 10_000 + const match = "match" + const space = " " + const dist = uint32(len(match) + len(space)) + data := bytes.Repeat([]byte(match+space), matches) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + columnHelper := columnHelper{data: data} + + lineOffset := 0 + offset := uint32(0) + for offset < uint32(len(data)) { + col := columnHelper.get(lineOffset, offset) + if col != offset+1 { + b.Fatal("column is not offset even though data is ASCII") + } + offset += dist + } + } +} + +func TestColumnHelper(t *testing.T) { + f := func(line0, line1 string) bool { + data := []byte(line0 + line1) + lineOffset := len(line0) + + columnHelper := columnHelper{data: data} + + // We check every second rune returns the correct answer + offset := lineOffset + column := 1 + for offset < len(data) { + if column%2 == 0 { + got := columnHelper.get(lineOffset, uint32(offset)) + if got != uint32(column) { + return false + } + } + _, size := utf8.DecodeRune(data[offset:]) + offset += size + column++ + } + + return true + } + + if err := quick.Check(f, nil); err != nil { + t.Fatal(err) + } + + // Corner cases + + // empty data, shouldn't happen but just in case it slips through + ch := columnHelper{data: nil} + if got := ch.get(0, 0); got != 1 { + t.Fatal("empty data didn't return 1", got) + } + + // Repeating a call to get should return the same value + // empty data, shouldn't happen but just in case it slips through + ch = columnHelper{data: []byte("hello\nworld")} + if got := ch.get(6, 8); got != 3 { + t.Fatal("unexpected value for third column on second line", got) + } + if got := ch.get(6, 8); got != 3 { + t.Fatal("unexpected value for repeated call for third column on second line", got) + } + + // Now make sure if we go backwards we do not incorrectly use the cache + if got := ch.get(6, 6); got != 1 { + t.Fatal("unexpected value for backwards call for first column on second line", got) + } +} From 784a3fc7cde87b004e12e3a4861ac0456d0331df Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Tue, 9 Jan 2024 12:43:44 +0200 Subject: [PATCH 2/3] reuse position of last column value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit O(nm) to O(n): benchstat before.txt after.txt name old time/op new time/op delta ColumnHelper-32 299ms ± 2% 0ms ± 2% -99.97% (p=0.000 n=10+10) --- contentprovider.go | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/contentprovider.go b/contentprovider.go index dd7eb8fd..7f5ce9f0 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -400,13 +400,32 @@ func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines in // makes this operation O(n) instead. type columnHelper struct { data []byte + + // 0 values for all these are valid values + lastLineOffset int + lastOffset uint32 + lastRuneCount uint32 } // get returns the line column for offset. offset is the byte offset of the // rune in data. lineOffset is the byte offset inside of data for the line // containing offset. func (c *columnHelper) get(lineOffset int, offset uint32) uint32 { - return uint32(utf8.RuneCount(c.data[lineOffset:offset]) + 1) + var runeCount uint32 + + if lineOffset == c.lastLineOffset && offset >= c.lastOffset { + // Can count from last calculation + runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset])) + } else { + // Need to count from the beginning of line + runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset])) + } + + c.lastLineOffset = lineOffset + c.lastOffset = offset + c.lastRuneCount = runeCount + + return runeCount + 1 } type newlines struct { From c72530e9ef2a448aaa71ca20c3b62a5a8a765fc4 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 10 Jan 2024 11:41:14 +0200 Subject: [PATCH 3/3] warn if not sorted --- contentprovider.go | 24 ++++++++++++++++++++++-- eval.go | 3 +++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/contentprovider.go b/contentprovider.go index 7f5ce9f0..68c41cfe 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -169,6 +169,12 @@ func (p *contentProvider) fillMatches(ms []*candidateMatch, numContextLines int, return result } +// fillChunkMatches converts the internal candidateMatch slice into our APIs ChunkMatch. +// +// Performance invariant: ms is sorted and non-overlapping. +// +// Note: the byte slices may be backed by mmapped data, so before being +// returned by the API it needs to be copied. func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines int, language string, debug bool) []ChunkMatch { var result []ChunkMatch if ms[0].fileName { @@ -290,10 +296,21 @@ func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLin func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numContextLines int) []ChunkMatch { newlines := p.newlines() - chunks := chunkCandidates(ms, newlines, numContextLines) data := p.data(false) - chunkMatches := make([]ChunkMatch, 0, len(chunks)) + + // columnHelper prevents O(len(ms) * len(data)) lookups for all columns. + // However, it depends on ms being sorted by byteOffset and non-overlapping. + // This invariant is true at the time of writing, but we conservatively + // enforce this. Note: chunkCandidates preserves the sorting so safe to + // transform now. columnHelper := columnHelper{data: data} + if !sort.IsSorted((sortByOffsetSlice)(ms)) { + log.Printf("WARN: performance invariant violated. candidate matches are not sorted in fillContentChunkMatches. Report to developers.") + sort.Sort((sortByOffsetSlice)(ms)) + } + + chunks := chunkCandidates(ms, newlines, numContextLines) + chunkMatches := make([]ChunkMatch, 0, len(chunks)) for _, chunk := range chunks { ranges := make([]Range, 0, len(chunk.candidates)) var symbolInfo []*Symbol @@ -362,6 +379,9 @@ type candidateChunk struct { // chunkCandidates groups a set of sorted, non-overlapping candidate matches by line number. Adjacent // chunks will be merged if adding `numContextLines` to the beginning and end of the chunk would cause // it to overlap with an adjacent chunk. +// +// input invariants: ms is sorted by byteOffset and is non overlapping with respect to endOffset. +// output invariants: if you flatten candidates the input invariant is retained. func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines int) []candidateChunk { var chunks []candidateChunk for _, m := range ms { diff --git a/eval.go b/eval.go index 6d938c63..902d0618 100644 --- a/eval.go +++ b/eval.go @@ -332,6 +332,9 @@ nextFileMatch: } } + // Important invariant for performance: finalCands is sorted by offset and + // non-overlapping. gatherMatches respects this invariant and all later + // transformations respect this. shouldMergeMatches := !opts.ChunkMatches finalCands := gatherMatches(mt, known, shouldMergeMatches)