Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid overlapping trigrams in distanceHitIterator #779

Merged
merged 1 commit into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bits.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func (n ngram) String() string {
type runeNgramOff struct {
ngram ngram
// index is the original index inside of the returned array of splitNGrams
index uint32
index int
}

func (a runeNgramOff) Compare(b runeNgramOff) int {
Expand Down Expand Up @@ -149,7 +149,7 @@ func splitNGrams(str []byte) []runeNgramOff {
ng := runesToNGram(runeGram)
result = append(result, runeNgramOff{
ngram: ng,
index: uint32(len(result)),
index: len(result),
})
}
return result
Expand Down
4 changes: 2 additions & 2 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ func TestSearchStats(t *testing.T) {
Want: Stats{
FilesLoaded: 1,
ContentBytesLoaded: 22,
IndexBytesLoaded: 8,
IndexBytesLoaded: 10,
NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND
NgramLookups: 104,
MatchCount: 2,
Expand Down Expand Up @@ -556,7 +556,7 @@ func TestSearchStats(t *testing.T) {
}},
Want: Stats{
ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents
IndexBytesLoaded: 8,
IndexBytesLoaded: 10,
FilesConsidered: 2, // important that we don't check 3 to ensure we are using the index
FilesLoaded: 2,
MatchCount: 0, // even though there is a match it doesn't align with a symbol
Expand Down
48 changes: 36 additions & 12 deletions indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,31 @@ func min2Index(xs []uint32) (idx0, idx1 int) {
return
}

// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
// the distance iterator. If they have the same frequency, we maximise the
// distance between them. first will always have a smaller index than last.
// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
// produce a small file intersection. It finds the two lowest frequency ngrams, making
// sure to maximize the distance between them in case of ties. It avoids overlapping
// trigrams to keep their intersection as small as possible.
//
// Invariant: first will always have a smaller index than last.
func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)

// If the trigrams are overlapping, then try to shift one to reduce overlap.
// This is guaranteed to produce a smaller intersection.
if last.index-first.index < ngramSize {
newFirstIndex := max(last.index-ngramSize, 0)
if newFirstIndex != first.index {
first = ngramOffs[indexMap[newFirstIndex]]
}

newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
if newLastIndex != last.index {
last = ngramOffs[indexMap[newLastIndex]]
}
}
return
}

func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
firstI, lastI := min2Index(frequencies)
// If the frequencies are equal lets maximise distance in the query
Copy link
Member Author

@jtibshirani jtibshirani May 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a follow-up, I'd like to try removing the "maximize distance" heuristic. I looked at some examples, and it's not clear to me that maximizing distance between "AAA" and "AAA" is better than filtering on "AAAAAA". I wonder if this was important before just because we commonly had overlapping trigrams. So before we would have a lot of overlap like "AAAA", whereas with my change this isn't a problem.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this optimization is likely unnecessary. I think it is only for the same trigram, and in that case I do think it makes a difference. But this seems like a very rare thing to happen and in the case it does happen it doesn't give us that much better perf.

For the examples of being different trigrams I don't see it making a difference that is worth the complexity.

Expand All @@ -357,13 +379,15 @@ func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (f
}
}
}

first = ngramOffs[firstI]
last = ngramOffs[lastI]
// Ensure first appears before last to make distance logic below clean.

// Ensure first appears before last as a helpful invariant.
if first.index > last.index {
last, first = first, last
}
return first, last
return
}

func (data *indexData) ngrams(filename bool) btreeIndex {
Expand Down Expand Up @@ -412,9 +436,10 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
// bucket (which can cause disk IO).
slices.SortFunc(ngramOffs, runeNgramOff.Compare)
frequencies := make([]uint32, 0, len(ngramOffs))
indexMap := make([]int, len(ngramOffs))
ngramLookups := 0
ngrams := d.ngrams(query.FileName)
for _, o := range ngramOffs {
for i, o := range ngramOffs {
var freq uint32
if query.CaseSensitive {
freq = ngrams.Get(o.ngram).sz
Expand All @@ -438,15 +463,14 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
}

frequencies = append(frequencies, freq)
indexMap[o.index] = i
}

// first and last are now the smallest trigram posting lists to iterate
// through.
first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)

iter := &ngramDocIterator{
leftPad: first.index,
rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
leftPad: uint32(first.index),
rightPad: uint32(utf8.RuneCountInString(str) - first.index),
ngramLookups: ngramLookups,
}
if query.FileName {
Expand All @@ -456,7 +480,7 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
}

if first != last {
runeDist := last.index - first.index
runeDist := uint32(last.index - first.index)
i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
Expand Down
28 changes: 28 additions & 0 deletions indexdata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,31 @@ func TestMinFrequencyNgramOffsets(t *testing.T) {
t.Fatal(err)
}
}

func TestFindSelectiveNGrams(t *testing.T) {
if err := quick.Check(func(s string, maxFreq uint16) bool {
ngramOffs := splitNGrams([]byte(s))
if len(ngramOffs) == 0 {
return true
}

slices.SortFunc(ngramOffs, runeNgramOff.Compare)
indexMap := make([]int, len(ngramOffs))
for i, n := range ngramOffs {
indexMap[n.index] = i
}

frequencies := genFrequencies(ngramOffs, int(maxFreq))
x0, x1 := findSelectiveNgrams(ngramOffs, indexMap, frequencies)

if len(ngramOffs) <= 1 {
return true
}

// Just assert the invariant that x0 is before x1. This test mostly checks
// for out-of-bounds errors.
return x0.index < x1.index
}, nil); err != nil {
t.Fatal(err)
}
}
Loading