From 197d1339ca6072d71641d73c07f72e88365eb3e4 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Wed, 17 Jan 2024 17:00:16 +0200 Subject: [PATCH] contentprovider: cache last newlines lookup I wanted to introduce another call to newlines.atOffset for the purpose of scoring. I was worried about doubling the calls to this so introduced a tiny (but hopefully effective) performance optimization. Essentially we often look up multiple times on the same line, but then the return of atOffset shouldn't change. So we introduce a tiny cache for this. We could do even more and if we are not on the same line, use the last computed index to restrict which side of the nls.locs we look at. However, there are no direct benchmarks for this and I haven't seen it in profiles before. So just doing this minor optimization for fun. Test Plan: go test --- contentprovider.go | 32 ++++++++++++++++++++++++-------- matchtree.go | 4 +++- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/contentprovider.go b/contentprovider.go index 0eea77f93..ba8119011 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -216,10 +216,11 @@ func (p *contentProvider) fillChunkMatches(ms []*candidateMatch, numContextLines } func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLines int) []LineMatch { + newlines := p.newlines() var result []LineMatch for len(ms) > 0 { m := ms[0] - num, lineStart, lineEnd := p.newlines().atOffset(m.byteOffset) + num, lineStart, lineEnd := newlines.atOffset(m.byteOffset) var lineCands []*candidateMatch @@ -267,8 +268,8 @@ func (p *contentProvider) fillContentMatches(ms []*candidateMatch, numContextLin finalMatch.Line = data[lineStart:lineEnd] if numContextLines > 0 { - finalMatch.Before = p.newlines().getLines(data, num-numContextLines, num) - finalMatch.After = p.newlines().getLines(data, num+1, num+1+numContextLines) + finalMatch.Before = newlines.getLines(data, num-numContextLines, num) + finalMatch.After = newlines.getLines(data, num+1, num+1+numContextLines) } for _, m := range lineCands { @@ -455,6 +456,12 @@ type newlines struct { // on this struct so we can safely know the length of the last line // in the file since not all files end in a newline. fileSize uint32 + + // PERF: These store the last computed value for atOffset. This is valuable + // since we often repeatedly look up these values for items on the same + // line. + lastLineNumber int + lastLineStart, lastLineEnd uint32 } // atOffset returns the line containing the offset. If the offset lands on @@ -462,20 +469,29 @@ type newlines struct { // by its linenumber (base-1, byte index of line start, byte index of // line end). The line end is the index of a newline, or the filesize // (if matching the last line of the file.) -func (nls newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { +func (nls *newlines) atOffset(offset uint32) (lineNumber, lineStart, lineEnd int) { + // Use cache if we are on the same line as the last call + if offset >= nls.lastLineStart && offset < nls.lastLineEnd { + return nls.lastLineNumber, int(nls.lastLineStart), int(nls.lastLineEnd) + } + idx := sort.Search(len(nls.locs), func(n int) bool { return nls.locs[n] >= offset }) - start, end := nls.lineBounds(idx + 1) - return idx + 1, int(start), int(end) + number := idx + 1 + start, end := nls.lineBounds(number) + + nls.lastLineNumber, nls.lastLineStart, nls.lastLineEnd = number, start, end + + return number, int(start), int(end) } // lineBounds returns the byte offsets of the start and end of the 1-based // lineNumber. The end offset is exclusive and will not contain the line-ending // newline. If the line number is out of range of the lines in the file, start // and end will be clamped to [0,fileSize]. -func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { +func (nls *newlines) lineBounds(lineNumber int) (start, end uint32) { // nls.locs[0] + 1 is the start of the 2nd line of data. startIdx := lineNumber - 2 endIdx := lineNumber - 1 @@ -501,7 +517,7 @@ func (nls newlines) lineBounds(lineNumber int) (start, end uint32) { // getLines returns a slice of data containing the lines [low, high). // low is 1-based and inclusive. high is 1-based and exclusive. -func (nls newlines) getLines(data []byte, low, high int) []byte { +func (nls *newlines) getLines(data []byte, low, high int) []byte { if low >= high { return nil } diff --git a/matchtree.go b/matchtree.go index 102e0bc26..3fb998a6b 100644 --- a/matchtree.go +++ b/matchtree.go @@ -642,6 +642,8 @@ func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matc } } + newlines := cp.newlines() + type lineRange struct { start int end int @@ -649,7 +651,7 @@ func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matc lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current)) prev := -1 for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current { - line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset) + line, byteStart, byteEnd := newlines.atOffset(candidate.byteOffset) if line == prev { continue }