diff --git a/build/builder.go b/build/builder.go index eae80d14..82aba3be 100644 --- a/build/builder.go +++ b/build/builder.go @@ -22,7 +22,6 @@ import ( "fmt" "io" "log" - "math" "net/url" "os" "os/exec" @@ -247,11 +246,11 @@ type Builder struct { nextShardNum int todo []*zoekt.Document + docChecker zoekt.DocChecker size int parserMap ctags.ParserMap - - building sync.WaitGroup + building sync.WaitGroup errMu sync.Mutex buildError error @@ -618,20 +617,13 @@ func (b *Builder) Add(doc zoekt.Document) error { } allowLargeFile := b.opts.IgnoreSizeMax(doc.Name) - - // Adjust trigramMax for allowed large files so we don't exclude them. - trigramMax := b.opts.TrigramMax - if allowLargeFile { - trigramMax = math.MaxInt64 - } - if len(doc.Content) > b.opts.SizeMax && !allowLargeFile { // We could pass the document on to the shardbuilder, but if // we pass through a part of the source tree with binary/large // files, the corresponding shard would be mostly empty, so // insert a reason here too. doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) - } else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil { + } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { doc.SkipReason = err.Error() doc.Language = "binary" } diff --git a/index_test.go b/index_test.go index aa4cf228..13adcf7d 100644 --- a/index_test.go +++ b/index_test.go @@ -3204,15 +3204,30 @@ func TestSkipInvalidContent(t *testing.T) { } } -func TestCheckText(t *testing.T) { +func TestDocChecker(t *testing.T) { + docChecker := DocChecker{} + + // Test valid and invalid text for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { - if err := CheckText([]byte(text), 20000); err != nil { - t.Errorf("CheckText(%q): %v", text, err) + if err := docChecker.Check([]byte(text), 20000, false); err != nil { + t.Errorf("Check(%q): %v", text, err) } } for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { - if err := CheckText([]byte(text), 15); err == nil { - t.Errorf("CheckText(%q) succeeded", text) + if err := docChecker.Check([]byte(text), 15, false); err == nil { + t.Errorf("Check(%q) succeeded", text) + } + } + + // Test valid and invalid text with an allowed large file + for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { + if err := docChecker.Check([]byte(text), 15, true); err != nil { + t.Errorf("Check(%q): %v", text, err) + } + } + for _, text := range []string{"zero\x00byte", "xx"} { + if err := docChecker.Check([]byte(text), 15, true); err == nil { + t.Errorf("Check(%q) succeeded", text) } } } diff --git a/indexbuilder.go b/indexbuilder.go index 9e65f25c..67773c6d 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -338,51 +338,6 @@ func (b *IndexBuilder) AddFile(name string, content []byte) error { return b.Add(Document{Name: name, Content: content}) } -// CheckText returns a reason why the given contents are probably not source texts. -func CheckText(content []byte, maxTrigramCount int) error { - if len(content) == 0 { - return nil - } - - if len(content) < ngramSize { - return fmt.Errorf("file size smaller than %d", ngramSize) - } - - // PERF: we only need to do the trigram check if the upperbound on content - // is greater than our threshold. - var trigrams map[ngram]struct{} - if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount { - trigrams = make(map[ngram]struct{}, maxTrigramCount+1) - } - - var cur [3]rune - byteCount := 0 - for len(content) > 0 { - if content[0] == 0 { - return fmt.Errorf("binary data at byte offset %d", byteCount) - } - - r, sz := utf8.DecodeRune(content) - content = content[sz:] - byteCount += sz - - cur[0], cur[1], cur[2] = cur[1], cur[2], r - if cur[0] == 0 { - // start of file. - continue - } - - if trigrams != nil { - trigrams[runesToNGram(cur)] = struct{}{} - if len(trigrams) > maxTrigramCount { - // probably not text. - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) - } - } - } - return nil -} - func (b *IndexBuilder) populateSubRepoIndices() error { if len(b.subRepoIndices) == len(b.repoList) { return nil @@ -559,3 +514,61 @@ func (b *IndexBuilder) branchMask(br string) uint64 { } return 0 } + +type DocChecker struct { + // A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations. + trigrams map[ngram]struct{} +} + +// Check returns a reason why the given contents are probably not source texts. +func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { + if len(content) == 0 { + return nil + } + + if len(content) < ngramSize { + return fmt.Errorf("file size smaller than %d", ngramSize) + } + + if index := bytes.IndexByte(content, 0); index > 0 { + return fmt.Errorf("binary data at byte offset %d", index) + } + + // PERF: we only need to do the trigram check if the upperbound on content is greater than + // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. + if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { + return nil + } + + var cur [3]rune + byteCount := 0 + t.clearTrigrams(maxTrigramCount) + + for len(content) > 0 { + r, sz := utf8.DecodeRune(content) + content = content[sz:] + byteCount += sz + + cur[0], cur[1], cur[2] = cur[1], cur[2], r + if cur[0] == 0 { + // start of file. + continue + } + + t.trigrams[runesToNGram(cur)] = struct{}{} + if len(t.trigrams) > maxTrigramCount { + // probably not text. + return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) + } + } + return nil +} + +func (t *DocChecker) clearTrigrams(maxTrigramCount int) { + if t.trigrams == nil { + t.trigrams = make(map[ngram]struct{}, maxTrigramCount) + } + for key := range t.trigrams { + delete(t.trigrams, key) + } +} diff --git a/install-ctags-alpine.sh b/install-ctags-alpine.sh index 963e269b..eb9a3d9a 100755 --- a/install-ctags-alpine.sh +++ b/install-ctags-alpine.sh @@ -3,5 +3,5 @@ set -eux # shellcheck disable=SC3040 set -o pipefail || true # Commit from 2023-10-24. Please always pick a commit from the main branch. -export SOURCEGRAPH_COMMIT=4dd4ce3d91da5cac2ac6169d3005714247178f57 +export SOURCEGRAPH_COMMIT=45a6748bb491513b9e1162d888711ca9b3bb4303 wget -O - https://raw.githubusercontent.com/sourcegraph/sourcegraph/$SOURCEGRAPH_COMMIT/cmd/symbols/ctags-install-alpine.sh | sh