index: experiment to limit ngram lookups for large snippets (#795)
This introduces an experiment where we can stop looking up ngrams once we
hit a certain limit. The insight here is that for large substrings we spend
more time finding the ngrams with the smallest frequencies than a normal
search would take. So instead we try to strike a balance between finding
two good (least frequent) ngrams and actually searching the corpus (a
sketch below puts rough numbers on this).

The plan is to set different values for
SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT in Sourcegraph production and
see how it affects the performance of the attribution search service.

Test Plan: ran all tests with the envvar set to 2. I expected tests that
assert on stats to fail, but everything else to pass. This was the case.

  SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT=2 go test ./...
keegancsmith authored Jul 26, 2024
1 parent 5ac92b1 commit 12ce07a
Showing 2 changed files with 35 additions and 2 deletions.
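
To put rough numbers on the insight above: an uncapped split produces roughly one trigram per rune, and each trigram costs a frequency lookup, so a large pasted snippet pays thousands of lookups just to pick its two best ngrams. The following standalone Go sketch (illustrative only, not zoekt code; the real index also looks up case foldings and ranks ngrams by frequency) shows the scale of the difference:

package main

import "fmt"

// trigramLookups estimates how many ngram-frequency lookups a pattern costs:
// one per trigram position, optionally capped (limit 0 means no cap).
func trigramLookups(pattern string, limit int) int {
    n := len([]rune(pattern)) - 2 // one trigram per rune position
    if n < 0 {
        n = 0
    }
    if limit > 0 && n > limit {
        n = limit
    }
    return n
}

func main() {
    snippet := string(make([]rune, 10000)) // stand-in for a large pasted snippet
    fmt.Println(trigramLookups(snippet, 0)) // 9998: one lookup per trigram
    fmt.Println(trigramLookups(snippet, 2)) // 2: with the experiment enabled
}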
7 changes: 6 additions & 1 deletion bits.go
@@ -124,14 +124,19 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
 }
 
 func splitNGrams(str []byte) []runeNgramOff {
+    // len(str) >= the number of ngrams in str => no limit
+    return splitNGramsLimit(str, len(str))
+}
+
+func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
     var runeGram [3]rune
     var off [3]uint32
     var runeCount int
 
     result := make([]runeNgramOff, 0, len(str))
     var i uint32
 
-    for len(str) > 0 {
+    for len(str) > 0 && len(result) < maxNgrams {
         r, sz := utf8.DecodeRune(str)
         str = str[sz:]
         runeGram[0] = runeGram[1]
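
As a rough illustration of the new loop condition in splitNGramsLimit, here is a self-contained sketch in the same shape (the real runeNgramOff also records byte and rune offsets, which this simplified stand-in omits):

package main

import (
    "fmt"
    "unicode/utf8"
)

// trigram is a simplified stand-in for runeNgramOff: just the three runes,
// without the offsets the real type carries.
type trigram [3]rune

// splitTrigramsLimit mirrors the shape of splitNGramsLimit: slide a
// three-rune window over str, stopping once maxNgrams trigrams are emitted.
func splitTrigramsLimit(str []byte, maxNgrams int) []trigram {
    var runeGram [3]rune
    var runeCount int
    result := make([]trigram, 0, maxNgrams)

    for len(str) > 0 && len(result) < maxNgrams {
        r, sz := utf8.DecodeRune(str)
        str = str[sz:]
        runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], r
        runeCount++
        if runeCount < 3 {
            continue // window not yet full
        }
        result = append(result, runeGram)
    }
    return result
}

func main() {
    // With a limit of 2, only the first two trigrams are produced,
    // regardless of how long the input is.
    for _, t := range splitTrigramsLimit([]byte("needle in a haystack"), 2) {
        fmt.Println(string(t[:]))
    }
    // Output:
    // nee
    // eed
}

With a limit of 2 the loop exits after the first two trigrams, so the cost of splitting no longer grows with the size of the snippet.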
30 changes: 29 additions & 1 deletion indexdata.go
@@ -21,7 +21,9 @@ import (
"hash/crc64"
"log"
"math/bits"
"os"
"slices"
"strconv"
"unicode/utf8"

"github.com/sourcegraph/zoekt/query"
@@ -401,11 +403,37 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
     return cs
 }
 
+// experimentIterateNgramLookupLimit when non-zero will only look up this many
+// ngrams from a query string. Note that if the query is case-insensitive,
+// this only limits the input, so we will still look up the case folding.
+//
+// This experiment targets looking up large snippets. If it is successful, we
+// will likely hardcode the value we use in production.
+//
+// Future note: if we find cases where this works badly, we can consider only
+// searching a random subset of the query string to avoid bad strings.
+var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
+
+func getEnvInt(k string) int {
+    v, _ := strconv.Atoi(os.Getenv(k))
+    if v != 0 {
+        log.Printf("%s = %d\n", k, v)
+    }
+    return v
+}
+
 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
     str := query.Pattern
 
     // Find the 2 least common ngrams from the string.
-    ngramOffs := splitNGrams([]byte(query.Pattern))
+    var ngramOffs []runeNgramOff
+    if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
+        // Note: we can't just do str = str[:ngramLimit] because of utf-8, and
+        // because str's length is used later for other optimizations.
+        ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
+    } else {
+        ngramOffs = splitNGrams([]byte(str))
+    }
 
     // protect against accidental searching of empty strings
     if len(ngramOffs) == 0 {
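
Two details of this hunk are worth unpacking. First, experimentIterateNgramLookupLimit is a package-level variable, so getEnvInt runs once at process start; the environment variable must be set before launch, which is why the test plan sets it on the go test command line. Second, the in-line comment explains why the code calls splitNGramsLimit instead of truncating str: byte slicing can split a multi-byte rune, and the full str length is still needed later. A small self-contained demonstration of the utf-8 pitfall:

package main

import (
    "fmt"
    "unicode/utf8"
)

func main() {
    // Byte slicing can split a multi-byte rune: in "héllo" the é spans
    // bytes 1-2, so str[:2] ends mid-rune.
    str := "héllo"
    fmt.Println(utf8.ValidString(str[:2])) // false

    // Decoding rune by rune, as splitNGramsLimit does, always stops on a
    // rune boundary and leaves str itself untouched.
    for i := 0; i < len(str); {
        r, sz := utf8.DecodeRuneInString(str[i:])
        fmt.Printf("%q starts at byte %d\n", r, i)
        i += sz
    }
}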
