From df7a7e7162cf7d7af4d4cdde3701c57950830676 Mon Sep 17 00:00:00 2001
From: Julie Tibshirani
Date: Wed, 15 May 2024 09:08:24 -0700
Subject: [PATCH] Simplify trigram selection in distanceHitIterator (#782)

Follow up to #779. This PR removes the logic for trigrams with the same frequency, because it will no longer have a big effect.
---
 indexdata.go | 52 ++++++++++++++++------------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)

diff --git a/indexdata.go b/indexdata.go
index 2fdd6ef5a..20ea48887 100644
--- a/indexdata.go
+++ b/indexdata.go
@@ -320,26 +320,9 @@ func (d *indexData) memoryUse() int {
 	return sz
 }
 
-const maxUInt32 = 0xffffffff
-
-func min2Index(xs []uint32) (idx0, idx1 int) {
-	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
-	for i, x := range xs {
-		if x <= min0 {
-			idx0, idx1 = i, idx0
-			min0, min1 = x, min0
-		} else if x <= min1 {
-			idx1 = i
-			min1 = x
-		}
-	}
-	return
-}
-
 // findSelectiveNgrams returns two ngrams to pass to the distance iterator, chosen to
-// produce a small file intersection. It finds the two lowest frequency ngrams, making
-// sure to maximize the distance between them in case of ties. It avoids overlapping
-// trigrams to keep their intersection as small as possible.
+// produce a small file intersection. It finds the two lowest frequency ngrams, but avoids
+// overlapping trigrams to keep their intersection as small as possible.
 //
 // Invariant: first will always have a smaller index than last.
 func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies []uint32) (first, last runeNgramOff) {
@@ -361,27 +344,24 @@ func findSelectiveNgrams(ngramOffs []runeNgramOff, indexMap []int, frequencies [
 	return
 }
 
+const maxUInt32 = 0xffffffff
+
 func minFrequencyNgramOffsets(ngramOffs []runeNgramOff, frequencies []uint32) (first, last runeNgramOff) {
-	firstI, lastI := min2Index(frequencies)
-	// If the frequencies are equal lets maximise distance in the query
-	// string. This optimization normally triggers for long repeated trigrams
-	// in a string, eg a query like "AAAAA..."
-	if frequencies[firstI] == frequencies[lastI] {
-		for i, freq := range frequencies {
-			if freq != frequencies[firstI] {
-				continue
-			}
-			if ngramOffs[i].index < ngramOffs[firstI].index {
-				firstI = i
-			}
-			if ngramOffs[i].index > ngramOffs[lastI].index {
-				lastI = i
-			}
+	// Find the two lowest frequency ngrams.
+	idx0, idx1 := 0, 0
+	min0, min1 := uint32(maxUInt32), uint32(maxUInt32)
+	for i, x := range frequencies {
+		if x <= min0 {
+			idx0, idx1 = i, idx0
+			min0, min1 = x, min0
+		} else if x <= min1 {
+			idx1 = i
+			min1 = x
 		}
 	}
 
-	first = ngramOffs[firstI]
-	last = ngramOffs[lastI]
+	first = ngramOffs[idx0]
+	last = ngramOffs[idx1]
 
 	// Ensure first appears before last as a helpful invariant.
 	if first.index > last.index {