Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into jtibs/test-options
Browse files Browse the repository at this point in the history
  • Loading branch information
jtibshirani committed Nov 21, 2023
2 parents 4b837f2 + b93ad77 commit 13ed87a
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 62 deletions.
14 changes: 3 additions & 11 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"fmt"
"io"
"log"
"math"
"net/url"
"os"
"os/exec"
Expand Down Expand Up @@ -247,11 +246,11 @@ type Builder struct {

nextShardNum int
todo []*zoekt.Document
docChecker zoekt.DocChecker
size int

parserMap ctags.ParserMap

building sync.WaitGroup
building sync.WaitGroup

errMu sync.Mutex
buildError error
Expand Down Expand Up @@ -618,20 +617,13 @@ func (b *Builder) Add(doc zoekt.Document) error {
}

allowLargeFile := b.opts.IgnoreSizeMax(doc.Name)

// Adjust trigramMax for allowed large files so we don't exclude them.
trigramMax := b.opts.TrigramMax
if allowLargeFile {
trigramMax = math.MaxInt64
}

if len(doc.Content) > b.opts.SizeMax && !allowLargeFile {
// We could pass the document on to the shardbuilder, but if
// we pass through a part of the source tree with binary/large
// files, the corresponding shard would be mostly empty, so
// insert a reason here too.
doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
} else if err := zoekt.CheckText(doc.Content, trigramMax); err != nil {
} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
doc.SkipReason = err.Error()
doc.Language = "binary"
}
Expand Down
25 changes: 20 additions & 5 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3204,15 +3204,30 @@ func TestSkipInvalidContent(t *testing.T) {
}
}

func TestCheckText(t *testing.T) {
func TestDocChecker(t *testing.T) {
docChecker := DocChecker{}

// Test valid and invalid text
for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} {
if err := CheckText([]byte(text), 20000); err != nil {
t.Errorf("CheckText(%q): %v", text, err)
if err := docChecker.Check([]byte(text), 20000, false); err != nil {
t.Errorf("Check(%q): %v", text, err)
}
}
for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} {
if err := CheckText([]byte(text), 15); err == nil {
t.Errorf("CheckText(%q) succeeded", text)
if err := docChecker.Check([]byte(text), 15, false); err == nil {
t.Errorf("Check(%q) succeeded", text)
}
}

// Test valid and invalid text with an allowed large file
for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} {
if err := docChecker.Check([]byte(text), 15, true); err != nil {
t.Errorf("Check(%q): %v", text, err)
}
}
for _, text := range []string{"zero\x00byte", "xx"} {
if err := docChecker.Check([]byte(text), 15, true); err == nil {
t.Errorf("Check(%q) succeeded", text)
}
}
}
Expand Down
103 changes: 58 additions & 45 deletions indexbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,51 +338,6 @@ func (b *IndexBuilder) AddFile(name string, content []byte) error {
return b.Add(Document{Name: name, Content: content})
}

// CheckText returns a reason why the given contents are probably not source texts.
func CheckText(content []byte, maxTrigramCount int) error {
if len(content) == 0 {
return nil
}

if len(content) < ngramSize {
return fmt.Errorf("file size smaller than %d", ngramSize)
}

// PERF: we only need to do the trigram check if the upperbound on content
// is greater than our threshold.
var trigrams map[ngram]struct{}
if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound > maxTrigramCount {
trigrams = make(map[ngram]struct{}, maxTrigramCount+1)
}

var cur [3]rune
byteCount := 0
for len(content) > 0 {
if content[0] == 0 {
return fmt.Errorf("binary data at byte offset %d", byteCount)
}

r, sz := utf8.DecodeRune(content)
content = content[sz:]
byteCount += sz

cur[0], cur[1], cur[2] = cur[1], cur[2], r
if cur[0] == 0 {
// start of file.
continue
}

if trigrams != nil {
trigrams[runesToNGram(cur)] = struct{}{}
if len(trigrams) > maxTrigramCount {
// probably not text.
return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
}
}
}
return nil
}

func (b *IndexBuilder) populateSubRepoIndices() error {
if len(b.subRepoIndices) == len(b.repoList) {
return nil
Expand Down Expand Up @@ -559,3 +514,61 @@ func (b *IndexBuilder) branchMask(br string) uint64 {
}
return 0
}

type DocChecker struct {
// A map to count the unique trigrams in a doc. Reused across docs to cut down on allocations.
trigrams map[ngram]struct{}
}

// Check returns a reason why the given contents are probably not source texts.
func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
if len(content) == 0 {
return nil
}

if len(content) < ngramSize {
return fmt.Errorf("file size smaller than %d", ngramSize)
}

if index := bytes.IndexByte(content, 0); index > 0 {
return fmt.Errorf("binary data at byte offset %d", index)
}

// PERF: we only need to do the trigram check if the upperbound on content is greater than
// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
return nil
}

var cur [3]rune
byteCount := 0
t.clearTrigrams(maxTrigramCount)

for len(content) > 0 {
r, sz := utf8.DecodeRune(content)
content = content[sz:]
byteCount += sz

cur[0], cur[1], cur[2] = cur[1], cur[2], r
if cur[0] == 0 {
// start of file.
continue
}

t.trigrams[runesToNGram(cur)] = struct{}{}
if len(t.trigrams) > maxTrigramCount {
// probably not text.
return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
}
}
return nil
}

func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
if t.trigrams == nil {
t.trigrams = make(map[ngram]struct{}, maxTrigramCount)
}
for key := range t.trigrams {
delete(t.trigrams, key)
}
}
2 changes: 1 addition & 1 deletion install-ctags-alpine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ set -eux
# shellcheck disable=SC3040
set -o pipefail || true
# Commit from 2023-10-24. Please always pick a commit from the main branch.
export SOURCEGRAPH_COMMIT=4dd4ce3d91da5cac2ac6169d3005714247178f57
export SOURCEGRAPH_COMMIT=45a6748bb491513b9e1162d888711ca9b3bb4303
wget -O - https://raw.githubusercontent.com/sourcegraph/sourcegraph/$SOURCEGRAPH_COMMIT/cmd/symbols/ctags-install-alpine.sh | sh

0 comments on commit 13ed87a

Please sign in to comment.