Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

index: experiment to limit ngram lookups for large snippets #795

Merged
merged 1 commit into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion bits.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,19 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
}

// splitNGrams returns the result of splitNGramsLimit for str with a cap that
// is large enough to never take effect.
func splitNGrams(str []byte) []runeNgramOff {
	// len(str) >= the number of ngrams in str, so using it as the maximum
	// means no limit is applied.
	noLimit := len(str)
	return splitNGramsLimit(str, noLimit)
}

func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
var runeGram [3]rune
var off [3]uint32
var runeCount int

result := make([]runeNgramOff, 0, len(str))
var i uint32

for len(str) > 0 {
for len(str) > 0 && len(result) < maxNgrams {
r, sz := utf8.DecodeRune(str)
str = str[sz:]
runeGram[0] = runeGram[1]
Expand Down
30 changes: 29 additions & 1 deletion indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ import (
"hash/crc64"
"log"
"math/bits"
"os"
"slices"
"strconv"
"unicode/utf8"

"github.com/sourcegraph/zoekt/query"
Expand Down Expand Up @@ -401,11 +403,37 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
return cs
}

// experimentIterateNgramLookupLimit, when greater than zero, caps how many
// ngrams from a query string are looked up. Note that for case-insensitive
// queries this only limits the input, so the case-folded variants are still
// looked up.
//
// This experiment targets lookups of large snippets. If it is successful, we
// will likely hardcode the value we use in production.
//
// Future note: if we find cases where this works badly, we can consider only
// searching a random subset of the query string to avoid bad strings.
//
// The value is read from the SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT
// environment variable when this package-level variable is initialized.
var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")

// getEnvInt reads the environment variable k and parses it as a base-10 int.
//
// It returns 0 when k is unset or empty. A value that fails to parse,
// including one that overflows int, is logged and also yields 0, so a
// misconfigured variable is visible in the logs instead of being silently
// ignored or clamped to the largest int. Any non-zero parsed value is logged
// so the active setting is visible at startup.
func getEnvInt(k string) int {
	raw := os.Getenv(k)
	if raw == "" {
		// Unset and empty are both treated as "not configured".
		return 0
	}

	v, err := strconv.Atoi(raw)
	if err != nil {
		// strconv.Atoi returns a clamped value alongside a range error, so
		// discard v explicitly rather than trusting it.
		log.Printf("ignoring invalid integer value for %s=%q: %v", k, raw, err)
		return 0
	}

	if v != 0 {
		log.Printf("%s = %d\n", k, v)
	}
	return v
}

func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
str := query.Pattern

// Find the 2 least common ngrams from the string.
ngramOffs := splitNGrams([]byte(query.Pattern))
var ngramOffs []runeNgramOff
if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
// Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
// length is asked later on for other optimizations.
ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
} else {
ngramOffs = splitNGrams([]byte(str))
}

// protect against accidental searching of empty strings
if len(ngramOffs) == 0 {
Expand Down
Loading