Skip to content

Commit

Permalink
Refactor proximity map creation algorithm to accept an iterator.
Browse files Browse the repository at this point in the history
  • Loading branch information
nicktobey committed Oct 22, 2024
1 parent 9429fa9 commit 50dc53e
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 54 deletions.
100 changes: 56 additions & 44 deletions go/store/prolly/proximity_map.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,30 @@ func NewProximityMap(ctx context.Context, ns tree.NodeStore, node tree.Node, key
}
}

// levelMapKeyDesc describes the keys of the intermediate level map used while building a ProximityMap.
// Each key is a two-field tuple, and neither field is nullable.
var levelMapKeyDesc = val.NewTupleDescriptor(
// Field 0: a uint8. Insert stores 255 minus the key's level here.
val.Type{Enc: val.Uint8Enc, Nullable: false},
// Field 1: a byte string. Insert stores the original key bytes here.
val.Type{Enc: val.ByteStringEnc, Nullable: false},
)

// NewProximityMapFromTuples creates a proximityMapBuilder backed by an empty level map. Add key-value pairs with Insert, then call Flush to build the ProximityMap.
func NewProximityMapFromTuples(ctx context.Context, ns tree.NodeStore, distanceType expression.DistanceType, keyDesc val.TupleDesc, valDesc val.TupleDesc, keys [][]byte, values [][]byte, logChunkSize uint8) (ProximityMap, error) {
builder := proximityMapBuilder{
func NewProximityMapFromTuples(ctx context.Context, ns tree.NodeStore, distanceType expression.DistanceType, keyDesc val.TupleDesc, valDesc val.TupleDesc, logChunkSize uint8) (proximityMapBuilder, error) {

emptyLevelMap, err := NewMapFromTuples(ctx, ns, levelMapKeyDesc, valDesc)
if err != nil {
return proximityMapBuilder{}, err
}
mutableLevelMap := newMutableMap(emptyLevelMap)
return proximityMapBuilder{
ns: ns,
vectorIndexSerializer: message.NewVectorIndexSerializer(ns.Pool()),
distanceType: distanceType,
keyDesc: keyDesc,
valDesc: valDesc,
logChunkSize: logChunkSize,
}
return builder.build(ctx, keys, values)

maxLevel: 0,
levelMap: mutableLevelMap,
}, nil
}

// proximityMapBuilder accumulates key-value pairs (via Insert) and builds a ProximityMap from them (via Flush). It also groups the helper functions used in creating a ProximityMap.
Expand All @@ -100,6 +113,21 @@ type proximityMapBuilder struct {
distanceType expression.DistanceType
keyDesc, valDesc val.TupleDesc
logChunkSize uint8

maxLevel uint8
levelMap *MutableMap
}

// Insert stages one key-value pair for inclusion in the ProximityMap being built.
//
// The key's level is computed with tree.DeterministicHashLevel from the builder's logChunkSize, and the
// builder's maxLevel is raised if this key's level exceeds it. The pair is then written to the level map
// under a (255 - level, key) tuple. Any error from the level map's Put is returned to the caller.
func (b *proximityMapBuilder) Insert(ctx context.Context, key, value []byte) error {
	level := tree.DeterministicHashLevel(b.logChunkSize, key)
	if b.maxLevel < level {
		b.maxLevel = level
	}

	// The level is stored inverted (255 - level) as the first field of the level map key.
	tb := val.NewTupleBuilder(levelMapKeyDesc)
	tb.PutUint8(0, 255-level)
	tb.PutByteString(1, key)
	levelKey := tb.Build(b.ns.Pool())

	return b.levelMap.Put(ctx, levelKey, value)
}

func (b *proximityMapBuilder) makeRootNode(ctx context.Context, keys, values [][]byte, subtrees []uint64, level int) (ProximityMap, error) {
Expand All @@ -116,7 +144,7 @@ func (b *proximityMapBuilder) makeRootNode(ctx context.Context, keys, values [][
return NewProximityMap(ctx, b.ns, rootNode, b.keyDesc, b.valDesc, b.distanceType), nil
}

func (b *proximityMapBuilder) build(ctx context.Context, keys, values [][]byte) (ProximityMap, error) {
func (b *proximityMapBuilder) Flush(ctx context.Context) (ProximityMap, error) {
// The algorithm for building a ProximityMap's tree requires us to start at the root and build out to the leaf nodes.
// Given that our trees are Merkle Trees, this presents an obvious problem.
// Our solution is to create the final tree by applying a series of transformations to intermediate trees.
Expand Down Expand Up @@ -148,64 +176,48 @@ func (b *proximityMapBuilder) build(ctx context.Context, keys, values [][]byte)
// separate in-memory NodeStore for these values.

// Check if index is empty.
if len(keys) == 0 {
if !b.levelMap.HasEdits() {
return b.makeRootNode(ctx, nil, nil, nil, 0)
}

// Step 1: Create `levelMap`, a map from (indexLevel, keyBytes) -> values
// We want the index to be sorted by level (descending), so currently we store the level in the map as
// 255 - the actual level. TODO: Implement a ReverseIter for MutableMap and use that instead.
levelMap, maxLevel, err := b.makeLevelMap(ctx, keys, values)
if err != nil {
return ProximityMap{}, err
}

if maxLevel == 0 {
if b.maxLevel == 0 {
// index is a single node.
// assuming that the keys are already sorted, we can return them unmodified.
levelMapIter, err := b.levelMap.IterAll(ctx)
if err != nil {
return ProximityMap{}, err
}
var keys, values [][]byte
for {
key, value, err := levelMapIter.Next(ctx)
if err == io.EOF {
break
}
originalKey, _ := levelMapKeyDesc.GetBytes(1, key)
if err != nil {
return ProximityMap{}, err
}
keys = append(keys, originalKey)
values = append(values, value)
}
return b.makeRootNode(ctx, keys, values, nil, 0)
}

// Step 2: Create `pathMaps`, a list of maps, each corresponding to a different level of the ProximityMap
pathMaps, err := b.makePathMaps(ctx, levelMap)
pathMaps, err := b.makePathMaps(ctx, b.levelMap)
if err != nil {
return ProximityMap{}, err
}

// Step 3: Create an iter over each `pathMap` created in the previous step, and walk the shape of the final ProximityMap,
// generating Nodes as we go.
return b.makeProximityMapFromPathMaps(ctx, pathMaps)
}

// makeLevelMap builds a prolly map in which every key is prefixed by the maximum level of that row in the
// corresponding ProximityMap. The prefix is stored as 255 - level.
//
// keys and values are parallel slices: values[i] is stored under the level-prefixed form of keys[i].
// It returns the populated mutable map together with the highest level computed for any key.
func (b *proximityMapBuilder) makeLevelMap(ctx context.Context, keys [][]byte, values [][]byte) (levelMap *MutableMap, maxLevel uint8, err error) {
	// Key layout: (uint8 inverted level, original key bytes), both non-nullable.
	levelMapKeyDesc := val.NewTupleDescriptor(
		val.Type{Enc: val.Uint8Enc, Nullable: false},
		val.Type{Enc: val.ByteStringEnc, Nullable: false},
	)

	emptyLevelMap, err := NewMapFromTuples(ctx, b.ns, levelMapKeyDesc, b.valDesc)
	if err != nil {
		return nil, 0, err
	}
	levelMap = newMutableMap(emptyLevelMap)

	for i, key := range keys {
		level := tree.DeterministicHashLevel(b.logChunkSize, key)
		if maxLevel < level {
			maxLevel = level
		}

		tb := val.NewTupleBuilder(levelMapKeyDesc)
		tb.PutUint8(0, 255-level)
		tb.PutByteString(1, key)
		if err = levelMap.Put(ctx, tb.Build(b.ns.Pool()), values[i]); err != nil {
			return nil, 0, err
		}
	}

	return levelMap, maxLevel, nil
}

// makePathMaps creates a set of prolly maps, each of which corresponds to a different level in the to-be-built ProximityMap
func (b *proximityMapBuilder) makePathMaps(ctx context.Context, mutableLevelMap *MutableMap) ([]*MutableMap, error) {
levelMapIter, err := mutableLevelMap.IterAll(ctx)
Expand Down
24 changes: 16 additions & 8 deletions go/store/prolly/proximity_map_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,29 @@ func createProximityMap(t *testing.T, ctx context.Context, ns tree.NodeStore, ve

distanceType := expression.DistanceL2Squared{}

builder, err := NewProximityMapFromTuples(ctx, ns, distanceType, kd, vd, logChunkSize)
require.NoError(t, err)

keys := make([][]byte, count)
values := make([][]byte, count)
keyBuilder := val.NewTupleBuilder(kd)
valueBuilder := val.NewTupleBuilder(vd)
for i, vector := range vectors {
keyBuilder.PutJSONAddr(0, newJsonDocument(t, ctx, ns, vector))
keys[i] = keyBuilder.Build(bp)
}
nextKey := keyBuilder.Build(bp)
keys[i] = nextKey

valueBuilder := val.NewTupleBuilder(vd)
values := make([][]byte, count)
for i, pk := range pks {
valueBuilder.PutInt64(0, pk)
values[i] = valueBuilder.Build(bp)
valueBuilder.PutInt64(0, pks[i])
nextValue := valueBuilder.Build(bp)
values[i] = nextValue

err = builder.Insert(ctx, nextKey, nextValue)
require.NoError(t, err)
}

m, err := NewProximityMapFromTuples(ctx, ns, distanceType, kd, vd, keys, values, logChunkSize)
m, err := builder.Flush(ctx)
require.NoError(t, err)

require.NoError(t, err)
mapCount, err := m.Count()
require.NoError(t, err)
Expand Down
4 changes: 2 additions & 2 deletions integration-tests/go-sql-server-driver/go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
module github.com/dolthub/dolt/integration-tests/go-sql-server-driver

go 1.22.5
go 1.23.0

toolchain go1.22.7
toolchain go1.23.2

require (
github.com/dolthub/dolt/go v0.40.4
Expand Down

0 comments on commit 50dc53e

Please sign in to comment.