sourcegraph · keegancsmith · Jan 10, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 10, 2024
diff --git a/contentprovider.go b/contentprovider.go
@@ -293,6 +293,7 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
 	chunks := chunkCandidates(ms, newlines, numContextLines)
 	data := p.data(false)
 	chunkMatches := make([]ChunkMatch, 0, len(chunks))
+	columnHelper := columnHelper{data: data}
 	for _, chunk := range chunks {
 		ranges := make([]Range, 0, len(chunk.candidates))
 		var symbolInfo []*Symbol
@@ -306,12 +307,12 @@ func (p *contentProvider) fillContentChunkMatches(ms []*candidateMatch, numConte
 				Start: Location{
 					ByteOffset: startOffset,
 					LineNumber: uint32(startLine),
-					Column:     uint32(utf8.RuneCount(data[startLineOffset:startOffset]) + 1),
+					Column:     columnHelper.get(startLineOffset, startOffset),
 				},
 				End: Location{
 					ByteOffset: endOffset,
 					LineNumber: uint32(endLine),
-					Column:     uint32(utf8.RuneCount(data[endLineOffset:endOffset]) + 1),
+					Column:     columnHelper.get(endLineOffset, endOffset),
 				},
 			})
 
@@ -392,6 +393,41 @@ func chunkCandidates(ms []*candidateMatch, newlines newlines, numContextLines in
 	return chunks
 }
 
+// columnHelper is a helper struct which caches the number of runes last
+// counted. If we naively use utf8.RuneCount for each match on a line, this
+// leads to an O(nm) algorithm where m is the number of matches and n is the
+// length of the line. Assuming our candidates are increasing in offset
+// makes this operation O(n) instead.
+type columnHelper struct {
+	data []byte
+
+	// 0 values for all these are valid values
+	lastLineOffset int
+	lastOffset     uint32
+	lastRuneCount  uint32
+}
+
+// get returns the 1-based column, counted in runes, of the byte at offset.
+// offset is a byte offset into data, and lineOffset is the byte offset in
+// data of the line containing offset.
+func (c *columnHelper) get(lineOffset int, offset uint32) uint32 {
+	var runeCount uint32
+
+	if lineOffset == c.lastLineOffset && offset >= c.lastOffset {
+		// Can count from last calculation
+		runeCount = c.lastRuneCount + uint32(utf8.RuneCount(c.data[c.lastOffset:offset]))
+	} else {
+		// Need to count from the beginning of line
+		runeCount = uint32(utf8.RuneCount(c.data[lineOffset:offset]))
+	}
+
+	c.lastLineOffset = lineOffset
+	c.lastOffset = offset
+	c.lastRuneCount = runeCount
+
+	return runeCount + 1
+}
+
 type newlines struct {
 	// locs is the sorted set of byte offsets of the newlines in the file
 	locs []uint32

diff --git a/contentprovider_test.go b/contentprovider_test.go
@@ -4,6 +4,8 @@ import (
 	"bytes"
 	"fmt"
 	"testing"
+	"testing/quick"
+	"unicode/utf8"
 
 	"github.com/google/go-cmp/cmp"
 )
@@ -327,3 +329,81 @@ func TestChunkMatches(t *testing.T) {
 		})
 	}
 }
+
+func BenchmarkColumnHelper(b *testing.B) {
+	// We simulate looking up columns of evenly spaced matches
+	const matches = 10_000
+	const match = "match"
+	const space = "         "
+	const dist = uint32(len(match) + len(space))
+	data := bytes.Repeat([]byte(match+space), matches)
+
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		columnHelper := columnHelper{data: data}
+
+		lineOffset := 0
+		offset := uint32(0)
+		for offset < uint32(len(data)) {
+			col := columnHelper.get(lineOffset, offset)
+			if col != offset+1 {
+				b.Fatal("column is not offset even though data is ASCII")
+			}
+			offset += dist
+		}
+	}
+}
+
+func TestColumnHelper(t *testing.T) {
+	f := func(line0, line1 string) bool {
+		data := []byte(line0 + line1)
+		lineOffset := len(line0)
+
+		columnHelper := columnHelper{data: data}
+
+		// Check that every second rune (the even columns) returns the correct answer
+		offset := lineOffset
+		column := 1
+		for offset < len(data) {
+			if column%2 == 0 {
+				got := columnHelper.get(lineOffset, uint32(offset))
+				if got != uint32(column) {
+					return false
+				}
+			}
+			_, size := utf8.DecodeRune(data[offset:])
+			offset += size
+			column++
+		}
+
+		return true
+	}
+
+	if err := quick.Check(f, nil); err != nil {
+		t.Fatal(err)
+	}
+
+	// Corner cases
+
+	// empty data, shouldn't happen but just in case it slips through
+	ch := columnHelper{data: nil}
+	if got := ch.get(0, 0); got != 1 {
+		t.Fatal("empty data didn't return 1", got)
+	}
+
+	// Repeating a call to get should return the same value; the second call
+	// has the same lineOffset and offset, so it goes through the cached path.
+	ch = columnHelper{data: []byte("hello\nworld")}
+	if got := ch.get(6, 8); got != 3 {
+		t.Fatal("unexpected value for third column on second line", got)
+	}
+	if got := ch.get(6, 8); got != 3 {
+		t.Fatal("unexpected value for repeated call for third column on second line", got)
+	}
+
+	// Now make sure if we go backwards we do not incorrectly use the cache
+	if got := ch.get(6, 6); got != 1 {
+		t.Fatal("unexpected value for backwards call for first column on second line", got)
+	}
+}