Skip to content

Commit

Permalink
seg: add BinarySearch method (#12618)
Browse files Browse the repository at this point in the history
  • Loading branch information
AskAlexSharov authored Nov 6, 2024
1 parent a5becca commit f4ece0d
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 19 deletions.
26 changes: 26 additions & 0 deletions erigon-lib/seg/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"sync/atomic"
"time"
Expand Down Expand Up @@ -1064,3 +1065,28 @@ func (g *Getter) FastNext(buf []byte) ([]byte, uint64) {
g.dataBit = 0
return buf[:wordLen], postLoopPos
}

// BinarySearch positions `g` at the first word that compares >= `seek`,
// probing the `count` candidate words whose offsets are supplied by `getOffset`.
//
// The words must be sorted in ascending byte order: the lookup relies on
// sort.Search, which only works on a monotonic predicate. Every probe does a
// Reset plus a word read, so the search jumps around the file and is not
// cheap - use it only when no better index is available.
//
// On success it returns the offset of the found word with ok == true and `g`
// is reset to that offset, so the caller can read the word with `.Next`.
// On ok == false the position of `g` is unspecified.
func (g *Getter) BinarySearch(seek []byte, count int, getOffset func(i uint64) (offset uint64)) (foundOffset uint64, ok bool) {
	// Scratch buffer shared by all probes to avoid re-allocating per word.
	var word []byte

	// reachedSeek reports whether the i-th word is >= seek.
	// A position with nothing to read is treated as "still below seek".
	reachedSeek := func(i int) bool {
		g.Reset(getOffset(uint64(i)))
		if !g.HasNext() {
			return false
		}
		word, _ = g.Next(word[:0])
		return bytes.Compare(word, seek) >= 0
	}

	idx := sort.Search(count, reachedSeek)
	if idx == count {
		// sort.Search returns n when no element satisfies the predicate.
		return 0, false
	}

	// Probing moved `g` around; put it back onto the found word.
	off := getOffset(uint64(idx))
	g.Reset(off)
	if g.HasNext() {
		return off, true
	}
	return 0, false
}
87 changes: 68 additions & 19 deletions erigon-lib/seg/decompress_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"math/rand"
"os"
"path/filepath"
"slices"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -257,22 +258,23 @@ func prepareLoremDictUncompressed(t *testing.T) *Decompressor {
cfg.MinPatternScore = 1
cfg.Workers = 2
c, err := NewCompressor(context.Background(), t.Name(), file, tmpDir, cfg, log.LvlDebug, logger)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
defer c.Close()
slices.Sort(loremStrings)
for k, w := range loremStrings {
if err = c.AddUncompressedWord([]byte(fmt.Sprintf("%s %d", w, k))); err != nil {
t.Fatal(err)
if len(w) == 0 {
err = c.AddUncompressedWord([]byte(w))
require.NoError(t, err)
continue
}
err = c.AddUncompressedWord([]byte(fmt.Sprintf("%s %d", w, k)))
require.NoError(t, err)
}
if err = c.Compress(); err != nil {
t.Fatal(err)
}
var d *Decompressor
if d, err = NewDecompressor(file); err != nil {
t.Fatal(err)
}
err = c.Compress()
require.NoError(t, err)
d, err := NewDecompressor(file)
require.NoError(t, err)
t.Cleanup(d.Close)
return d
}

Expand All @@ -281,16 +283,60 @@ func TestUncompressed(t *testing.T) {
defer d.Close()
g := d.MakeGetter()
i := 0
var offsets []uint64
offsets = append(offsets, 0)
for g.HasNext() {
w := loremStrings[i]
expected := []byte(fmt.Sprintf("%s %d", w, i+1))
expected = expected[:len(expected)/2]
actual, _ := g.NextUncompressed()
actual, offset := g.NextUncompressed()
if bytes.Equal(expected, actual) {
t.Errorf("expected %s, actual %s", expected, actual)
}
i++
}
offsets = append(offsets, offset)
}

t.Run("BinarySearch middle", func(t *testing.T) {
require := require.New(t)
_, ok := g.BinarySearch([]byte("ipsum"), d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.True(ok)
k, _ := g.Next(nil)
require.Equal("ipsum 38", string(k))
_, ok = g.BinarySearch([]byte("ipsu"), d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.True(ok)
k, _ = g.Next(nil)
require.Equal("ipsum 38", string(k))
})
t.Run("BinarySearch end of file", func(t *testing.T) {
require := require.New(t)
//last word is `voluptate`
_, ok := g.BinarySearch([]byte("voluptate"), d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.True(ok)
k, _ := g.Next(nil)
require.Equal("voluptate 69", string(k))
_, ok = g.BinarySearch([]byte("voluptat"), d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.True(ok)
k, _ = g.Next(nil)
require.Equal("voluptate 69", string(k))
_, ok = g.BinarySearch([]byte("voluptatez"), d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.False(ok)
})

t.Run("BinarySearch begin of file", func(t *testing.T) {
require := require.New(t)
//first word is ``
_, ok := g.BinarySearch([]byte(""), d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.True(ok)
k, _ := g.Next(nil)
require.Equal("", string(k))

_, ok = g.BinarySearch(nil, d.Count(), func(i uint64) (offset uint64) { return offsets[i] })
require.True(ok)
k, _ = g.Next(nil)
require.Equal("", string(k))
})

}

func TestDecompressor_OpenCorrupted(t *testing.T) {
Expand Down Expand Up @@ -461,12 +507,15 @@ func TestDecompressor_OpenCorrupted(t *testing.T) {
})
}

const lorem = `Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et
dolore magna aliqua Ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur
Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`
const lorem = `lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et
dolore magna aliqua ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur
excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum`

var loremStrings = strings.Split(lorem, " ")
var loremStrings = append(strings.Split(rmNewLine(lorem), " "), "") // including empty string - to trigger corner cases
// rmNewLine flattens multi-line text onto a single line: every "\n" becomes
// a single space and every "\r" is dropped.
func rmNewLine(s string) string {
	flatten := strings.NewReplacer(
		"\n", " ", // line feed -> word separator
		"\r", "", // carriage return -> removed
	)
	return flatten.Replace(s)
}

func TestDecompressTorrent(t *testing.T) {
t.Skip()
Expand Down

0 comments on commit f4ece0d

Please sign in to comment.