Skip to content

Commit

Permalink
Avoid overlapping trigrams in distanceHitIterator
Browse files Browse the repository at this point in the history
  • Loading branch information
jtibshirani committed May 7, 2024
1 parent 4e674a4 commit a35ccdc
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 16 deletions.
4 changes: 2 additions & 2 deletions bits.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func (n ngram) String() string {
type runeNgramOff struct {
ngram ngram
// index is the original index inside of the returned array of splitNGrams
index uint32
index int
}

func (a runeNgramOff) Compare(b runeNgramOff) int {
Expand Down Expand Up @@ -149,7 +149,7 @@ func splitNGrams(str []byte) []runeNgramOff {
ng := runesToNGram(runeGram)
result = append(result, runeNgramOff{
ngram: ng,
index: uint32(len(result)),
index: len(result),
})
}
return result
Expand Down
4 changes: 2 additions & 2 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ func TestSearchStats(t *testing.T) {
Want: Stats{
FilesLoaded: 1,
ContentBytesLoaded: 22,
IndexBytesLoaded: 8,
IndexBytesLoaded: 10,
NgramMatches: 3, // we look at doc 1, because it's max(0,1) due to AND
NgramLookups: 104,
MatchCount: 2,
Expand Down Expand Up @@ -556,7 +556,7 @@ func TestSearchStats(t *testing.T) {
}},
Want: Stats{
ContentBytesLoaded: 33, // we still have to run regex since "app" matches two documents
IndexBytesLoaded: 8,
IndexBytesLoaded: 10,
FilesConsidered: 2, // important that we don't check 3 to ensure we are using the index
FilesLoaded: 2,
MatchCount: 0, // even though there is a match it doesn't align with a symbol
Expand Down
48 changes: 36 additions & 12 deletions indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,31 @@ func min2Index(xs []uint32) (idx0, idx1 int) {
return
}

// minFrequencyNgramOffsets returns the two lowest frequency ngrams to pass to
// the distance iterator. If they have the same frequency, we maximise the
// distance between them. first will always have a smaller index than last.
// findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
// produce a small file intersection. It finds the two lowest frequency ngrams, making
// sure to maximize the distance between them in case of ties. It avoids overlapping
// trigrams to keep their intersection as small as possible.
//
// Invariant: first will always have a smaller index than last.
func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
first, last = minFrequencyNgramOffsets(ngramOffs, frequencies)

// If the trigrams are overlapping, then try to shift one to reduce overlap.
// This is guaranteed to produce a smaller intersection.
if last.index-first.index < ngramSize {
newFirstIndex := max(last.index-ngramSize, 0)
if newFirstIndex != first.index {
first = ngramOffs[indexMap[newFirstIndex]]
}

newLastIndex := min(first.index+ngramSize, len(ngramOffs)-1)
if newLastIndex != last.index {
last = ngramOffs[indexMap[newLastIndex]]
}
}
return
}

func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
firstI, lastI := min2Index(frequencies)
// If the frequencies are equal lets maximise distance in the query
Expand All @@ -357,13 +379,15 @@ func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (f
}
}
}

first = ngramOffs[firstI]
last = ngramOffs[lastI]
// Ensure first appears before last to make distance logic below clean.

// Ensure first appears before last as a helpful invariant.
if first.index > last.index {
last, first = first, last
}
return first, last
return
}

func (data *indexData) ngrams(filename bool) btreeIndex {
Expand Down Expand Up @@ -412,9 +436,10 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
// bucket (which can cause disk IO).
slices.SortFunc(ngramOffs, runeNgramOff.Compare)
frequencies := make([]uint32, 0, len(ngramOffs))
indexMap := make([]int, len(ngramOffs))
ngramLookups := 0
ngrams := d.ngrams(query.FileName)
for _, o := range ngramOffs {
for i, o := range ngramOffs {
var freq uint32
if query.CaseSensitive {
freq = ngrams.Get(o.ngram).sz
Expand All @@ -438,15 +463,14 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
}

frequencies = append(frequencies, freq)
indexMap[o.index] = i
}

// first and last are now the smallest trigram posting lists to iterate
// through.
first, last := minFrequencyNgramOffsets(ngramOffs, frequencies)
first, last := findSelectiveNgrams(ngramOffs, indexMap, frequencies)

iter := &ngramDocIterator{
leftPad: first.index,
rightPad: uint32(utf8.RuneCountInString(str)) - first.index,
leftPad: uint32(first.index),
rightPad: uint32(utf8.RuneCountInString(str) - first.index),
ngramLookups: ngramLookups,
}
if query.FileName {
Expand All @@ -456,7 +480,7 @@ func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResult
}

if first != last {
runeDist := last.index - first.index
runeDist := uint32(last.index - first.index)
i, err := d.newDistanceTrigramIter(first.ngram, last.ngram, runeDist, query.CaseSensitive, query.FileName)
if err != nil {
return nil, err
Expand Down
28 changes: 28 additions & 0 deletions indexdata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,31 @@ func TestMinFrequencyNgramOffsets(t *testing.T) {
t.Fatal(err)
}
}

func TestFindSelectiveNGrams(t *testing.T) {
if err := quick.Check(func(s string, maxFreq uint16) bool {
ngramOffs := splitNGrams([]byte(s))
if len(ngramOffs) == 0 {
return true
}

slices.SortFunc(ngramOffs, runeNgramOff.Compare)
indexMap := make([]int, len(ngramOffs))
for i, n := range ngramOffs {
indexMap[n.index] = i
}

frequencies := genFrequencies(ngramOffs, int(maxFreq))
x0, x1 := findSelectiveNgrams(ngramOffs, indexMap, frequencies)

if len(ngramOffs) <= 1 {
return true
}

// Just assert the invariant that x0 is before x1. This test mostly checks
// for out-of-bounds errors.
return x0.index < x1.index
}, nil); err != nil {
t.Fatal(err)
}
}

0 comments on commit a35ccdc

Please sign in to comment.