index: experiment to limit ngram lookups for large snippets
This introduces an experiment where we can stop looking up ngrams at a
certain limit. The insight here is that for large substrings we spend
more time finding the lowest-frequency ngrams than a normal search
takes. So instead we try to strike a balance between finding two good
ngrams and actually searching the corpus.
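
To put rough numbers on the insight (an illustrative back-of-the-envelope
sketch, not a measurement from this change): a pattern of n runes contains
about n-2 trigrams, and each one costs a frequency lookup before the
actual search can start.

  // Illustrative sketch: frequency lookups needed before searching,
  // with and without a lookup limit.
  package main

  import "fmt"

  func main() {
  	patternRunes := 10000 // e.g. a large pasted snippet
  	trigrams := patternRunes - 2
  	limit := 2 // SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT=2
  	fmt.Println("lookups without limit:", trigrams) // 9998
  	fmt.Println("lookups with limit:", limit)       // 2
  }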

The plan is to set different values for
SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT in Sourcegraph production and
see how it affects the performance of the attribution search service.

Test Plan: ran all tests with the envvar set to 2. I expected tests that
assert on stats to fail, but everything else to pass. This was the case.

  SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT=2 go test ./...
keegancsmith committed Jul 26, 2024
1 parent 5ac92b1 commit 1b05951
Showing 2 changed files with 35 additions and 2 deletions.
bits.go (7 changes: 6 additions & 1 deletion)
@@ -124,14 +124,19 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
 }
 
 func splitNGrams(str []byte) []runeNgramOff {
+	// len(str) >= the number of ngrams in str => no limit
+	return splitNGramsLimit(str, len(str))
+}
+
+func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
 	var runeGram [3]rune
 	var off [3]uint32
 	var runeCount int
 
 	result := make([]runeNgramOff, 0, len(str))
 	var i uint32
 
-	for len(str) > 0 {
+	for len(str) > 0 && len(result) < maxNgrams {
 		r, sz := utf8.DecodeRune(str)
 		str = str[sz:]
 		runeGram[0] = runeGram[1]
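
A quick sanity check of the new helper (a hypothetical test-style sketch,
not part of this commit; it assumes the standard testing import and the
same package as splitNGramsLimit): the limit simply caps how many ngrams
come back.

  // Hypothetical in-package test: the limit caps the result count.
  func TestSplitNGramsLimitSketch(t *testing.T) {
  	all := splitNGrams([]byte("hello"))         // hel, ell, llo
  	two := splitNGramsLimit([]byte("hello"), 2) // hel, ell
  	if len(all) != 3 || len(two) != 2 {
  		t.Fatalf("got %d and %d ngrams, want 3 and 2", len(all), len(two))
  	}
  }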
indexdata.go (30 changes: 29 additions & 1 deletion)
@@ -21,7 +21,9 @@ import (
 	"hash/crc64"
 	"log"
 	"math/bits"
+	"os"
 	"slices"
+	"strconv"
 	"unicode/utf8"
 
 	"github.com/sourcegraph/zoekt/query"
@@ -401,11 +403,37 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
 	return cs
 }
 
+// experimentIterateNgramLookupLimit when non-zero will only look up this
+// many ngrams from a query string. Note: if case-insensitive, this only
+// limits the input, so we will still look up the case foldings.
+//
+// This experiment targets lookups of large snippets. If it is
+// successful, we will likely hardcode the value we use in production.
+//
+// Future note: if we find cases where this works badly, we can consider only
+// searching a random subset of the query string to avoid bad strings.
+var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
+
+func getEnvInt(k string) int {
+	v, _ := strconv.Atoi(os.Getenv(k))
+	if v != 0 {
+		log.Printf("%s = %d\n", k, v)
+	}
+	return v
+}
+
 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
 	str := query.Pattern
 
 	// Find the 2 least common ngrams from the string.
-	ngramOffs := splitNGrams([]byte(query.Pattern))
+	var ngramOffs []runeNgramOff
+	if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
+		// Note: we can't just do str = str[:ngramLimit] because of utf-8, and
+		// the length of str is used again later for other optimizations.
+		ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
+	} else {
+		ngramOffs = splitNGrams([]byte(str))
+	}
 
 	// protect against accidental searching of empty strings
 	if len(ngramOffs) == 0 {
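
One caveat from the doc comment above is easy to miss: the limit bounds
how many ngrams are taken from the query string, not the total index
lookups, since a case-insensitive query still expands each surviving
ngram into its case foldings. A rough sketch of the arithmetic
(illustrative numbers, assuming ASCII letters):

  // Illustrative: a surviving trigram of ASCII letters has up to
  // 2^3 = 8 case variants, each of which is still looked up.
  limit := 2
  foldingsPerTrigram := 8
  fmt.Println("max index lookups:", limit*foldingsPerTrigram) // 16, not 2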
