Indexing: improve doc content checks (#688)
This change makes a couple of improvements to the doc content checks during indexing. When a large file is explicitly marked as "allowed", we don't enforce the max trigram count; previously, though, we still iterated through all of its trigrams and collected them in a map. Now we short-circuit the check and avoid counting the trigrams at all.

We now also reuse the trigram map across documents, which makes sense because it is always presized with the same capacity hint. This doesn't have a significant effect on indexing speed, but it significantly reduces allocations.
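For illustration only, here is a self-contained sketch of the two ideas (short-circuiting the count for allowed large files, and reusing one presized map across calls), using plain string "trigrams" in place of zoekt's internal ngram type; the real implementation is the DocChecker added in indexbuilder.go below.

package main

import "fmt"

// checker mirrors the DocChecker pattern in this commit: it owns one map that is
// cleared and reused for every document instead of being reallocated per call.
// (Simplified illustration, not the zoekt type.)
type checker struct {
	seen map[string]struct{}
}

func (c *checker) check(trigrams []string, maxTrigramCount int, allowLargeFile bool) error {
	// Short-circuit: if the document cannot exceed the limit, or it is
	// explicitly allowed to be large, there is nothing to count.
	if len(trigrams) <= maxTrigramCount || allowLargeFile {
		return nil
	}
	if c.seen == nil {
		c.seen = make(map[string]struct{}, maxTrigramCount)
	}
	for k := range c.seen {
		delete(c.seen, k)
	}
	for _, t := range trigrams {
		c.seen[t] = struct{}{}
		if len(c.seen) > maxTrigramCount {
			return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
		}
	}
	return nil
}

func main() {
	c := checker{}
	fmt.Println(c.check([]string{"abc", "bcd", "cde"}, 2, false)) // too many unique trigrams: error
	fmt.Println(c.check([]string{"abc", "bcd", "cde"}, 2, true))  // allowed large file: check skipped
	fmt.Println(c.check([]string{"abc", "abc", "abc"}, 2, false)) // reuses the same map, passes
}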
jtibshirani authored Nov 20, 2023
1 parent e9dcbbb commit 2a4f4a4
Showing 3 changed files with 81 additions and 61 deletions.
14 changes: 3 additions & 11 deletions build/builder.go
@@ -22,7 +22,6 @@ import (
"fmt"
"io"
"log"
"math"
"net/url"
"os"
"os/exec"
@@ -247,11 +246,11 @@ type Builder struct {

nextShardNum int
todo []*zoekt.Document
docChecker zoekt.DocChecker
size int

parserMap ctags.ParserMap

building sync.WaitGroup

errMu sync.Mutex
buildError error
@@ -618,20 +617,13 @@ func (b *Builder) Add(doc zoekt.Document) error {
}

allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)

// Adjust trigramMax for allowed large files so we don't exclude them.
trigramMax := b.opts.TrigramMax
if allowLargeFile {
trigramMax = math.MaxInt64
}

if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
// We could pass the document on to the shardbuilder, but if
// we pass through a part of the source tree with binary/large
// files, the corresponding shard would be mostly empty, so
// insert a reason here too.
doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
} else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil {
} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
doc.SkipReason = err.Error()
doc.Language = "binary"
}
25 changes: 20 additions & 5 deletions index_test.go
@@ -3204,15 +3204,30 @@ func TestSkipInvalidContent(t *testing.T) {
}
}

func TestCheckText(t *testing.T) {
func TestDocChecker(t *testing.T) {
docChecker := DocChecker{}

// Test valid and invalid text
for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} {
if err := CheckText([]byte(text), 20000); err != nil {
t.Errorf("CheckText(%q): %v", text, err)
if err := docChecker.Check([]byte(text), 20000, false); err != nil {
t.Errorf("Check(%q): %v", text, err)
}
}
for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} {
if err := CheckText([]byte(text), 15); err == nil {
t.Errorf("CheckText(%q) succeeded", text)
if err := docChecker.Check([]byte(text), 15, false); err == nil {
t.Errorf("Check(%q) succeeded", text)
}
}

// Test valid and invalid text with an allowed large file
for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} {
if err := docChecker.Check([]byte(text), 15, true); err != nil {
t.Errorf("Check(%q): %v", text, err)
}
}
for _, text := range []string{"zero\x00byte", "xx"} {
if err := docChecker.Check([]byte(text), 15, true); err == nil {
t.Errorf("Check(%q) succeeded", text)
}
}
}
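For completeness, a hypothetical extra test (not part of this commit) could exercise the reuse directly by calling Check several times on one DocChecker value, using only the signature introduced above:

func TestDocCheckerReuse(t *testing.T) {
	docChecker := DocChecker{}

	// The same checker is used for several documents; its internal trigram
	// map is cleared between calls rather than reallocated.
	for _, text := range []string{"first ordinary document", "second ordinary document", "0123456789abcdefghi"} {
		if err := docChecker.Check([]byte(text), 20000, false); err != nil {
			t.Errorf("Check(%q): %v", text, err)
		}
	}
}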
103 changes: 58 additions & 45 deletions indexbuilder.go
@@ -338,51 +338,6 @@ func (b *IndexBuilder) AddFile(name string, content []byte) error {
return b.Add(Document{Name: name, Content: content})
}

// CheckText returns a reason why the given contents are probably not source texts.
func CheckText(content []byte, maxTrigramCount int) error {
if len(content) == 0 {
return nil
}

if len(content) < ngramSize {
return fmt.Errorf("file size smaller than %d", ngramSize)
}

// PERF: we only need to do the trigram check if the upperbound on content
// is greater than our threshold.
var trigrams map[ngram]struct{}
if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount {
trigrams = make(map[ngram]struct{}, maxTrigramCount+1)
}

var cur [3]rune
byteCount := 0
for len(content) > 0 {
if content[0] == 0 {
return fmt.Errorf("binary data at byte offset %d", byteCount)
}

r, sz := utf8.DecodeRune(content)
content = content[sz:]
byteCount += sz

cur[0], cur[1], cur[2] = cur[1], cur[2], r
if cur[0] == 0 {
// start of file.
continue
}

if trigrams != nil {
trigrams[runesToNGram(cur)] = struct{}{}
if len(trigrams) > maxTrigramCount {
// probably not text.
return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
}
}
}
return nil
}

func (b *IndexBuilder) populateSubRepoIndices() error {
if len(b.subRepoIndices) == len(b.repoList) {
return nil
@@ -559,3 +514,61 @@ func (b *IndexBuilder) branchMask(br string) uint64 {
}
return 0
}

type DocChecker struct {
// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
trigrams map[ngram]struct{}
}

// Check returns a reason why the given contents are probably not source texts.
func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
if len(content) == 0 {
return nil
}

if len(content) < ngramSize {
return fmt.Errorf("file size smaller than %d", ngramSize)
}

if index := bytes.IndexByte(content, 0); index >= 0 {
return fmt.Errorf("binary data at byte offset %d", index)
}

// PERF: we only need to do the trigram check if the upperbound on content is greater than
// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
return nil
}

var cur [3]rune
byteCount := 0
t.clearTrigrams(maxTrigramCount)

for len(content) > 0 {
r, sz := utf8.DecodeRune(content)
content = content[sz:]
byteCount += sz

cur[0], cur[1], cur[2] = cur[1], cur[2], r
if cur[0] == 0 {
// start of file.
continue
}

t.trigrams[runesToNGram(cur)] = struct{}{}
if len(t.trigrams) > maxTrigramCount {
// probably not text.
return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
}
}
return nil
}

func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
if t.trigrams == nil {
t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
}
for key := range t.trigrams {
delete(t.trigrams, key)
}
}
