Skip to content

Commit

Permalink
vecindex: support deleting vectors from C-SPANN index
Browse files Browse the repository at this point in the history
Add Delete method to the vector index, which attempts to remove a vector
from the index, given its value and primary key. Delete may not be able
to locate the vector in the index, leaving a "dangling vector" reference
that other index methods must take care to handle.

Epic: CRDB-42943

Release note: None
  • Loading branch information
andy-kimball committed Nov 15, 2024
1 parent 89df825 commit 8233287
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 19 deletions.
138 changes: 137 additions & 1 deletion pkg/sql/vecindex/testdata/delete.ddt
Original file line number Diff line number Diff line change
@@ -1,5 +1,141 @@
# ----------
# Test deleting vectors from primary index, but not from secondary index.
# Construct new index with one vector in the root.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
----
• 1 (0, 0)
└───• vec1 (1, 2)

# Delete remaining vector in the root.
delete
vec1
----
• 1 (0, 0)

# ----------
# Construct new index with only duplicate vectors.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
vec2: (1, 2)
vec3: (1, 2)
vec4: (1, 2)
vec5: (1, 2)
vec6: (1, 2)
----
• 1 (1, 2)
├───• 2 (1, 2)
│ │
│ ├───• vec1 (1, 2)
│ └───• vec2 (1, 2)
└───• 3 (1, 2)
├───• vec3 (1, 2)
├───• vec4 (1, 2)
├───• vec5 (1, 2)
└───• vec6 (1, 2)

# Ensure the correct duplicates are deleted (i.e. with matching keys).
delete
vec1
vec5
----
• 1 (1, 2)
├───• 2 (1, 2)
│ │
│ └───• vec2 (1, 2)
└───• 3 (1, 2)
├───• vec3 (1, 2)
├───• vec4 (1, 2)
└───• vec6 (1, 2)

# ----------
# Construct new index with multiple levels.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=1
vec1: (1, 2)
vec2: (7, 4)
vec3: (4, 3)
vec4: (-4, 5)
vec5: (1, 11)
vec6: (1, -6)
vec7: (0, 4)
vec8: (-2, 8)
vec9: (2, 8)
----
• 1 (1.5, 1.875)
├───• 2 (1, -2)
│ │
│ ├───• vec1 (1, 2)
│ └───• vec6 (1, -6)
├───• 4 (1.75, 4)
│ │
│ ├───• vec3 (4, 3)
│ ├───• vec4 (-4, 5)
│ ├───• vec7 (0, 4)
│ └───• vec2 (7, 4)
└───• 5 (0.3333, 9)
├───• vec5 (1, 11)
├───• vec8 (-2, 8)
└───• vec9 (2, 8)

# Test case where initial search fails to find vector to delete and it must be
# retried.
delete
vec1: (0, 8)
----
• 1 (1.5, 1.875)
├───• 2 (1, -2)
│ │
│ └───• vec6 (1, -6)
├───• 4 (1.75, 4)
│ │
│ ├───• vec3 (4, 3)
│ ├───• vec4 (-4, 5)
│ ├───• vec7 (0, 4)
│ └───• vec2 (7, 4)
└───• 5 (0.3333, 9)
├───• vec5 (1, 11)
├───• vec8 (-2, 8)
└───• vec9 (2, 8)

# Delete multiple vectors.
delete
vec4
vec5
vec6
----
• 1 (1.5, 1.875)
├───• 2 (1, -2)
├───• 4 (1.75, 4)
│ │
│ ├───• vec3 (4, 3)
│ ├───• vec2 (7, 4)
│ └───• vec7 (0, 4)
└───• 5 (0.3333, 9)
├───• vec9 (2, 8)
└───• vec8 (-2, 8)

# ----------
# Construct new index with multiple levels.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/vecindex/testdata/insert.ddt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# ----------
# Construct empty index.
# ----------
new-index min-partition-size=1 max-partition-size=4 beam-size=2
----
• 1 (0, 0)
Expand Down
9 changes: 9 additions & 0 deletions pkg/sql/vecindex/vecstore/in_memory_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,15 @@ func (s *InMemoryStore) DeleteVector(txn Txn, key PrimaryKey) {
delete(s.mu.vectors, string(key))
}

// GetVector looks up the vector stored under the given primary key and
// returns it. The store's mutex is held for the duration of the lookup. If no
// vector is stored under the key, the zero value of vector.T is returned. The
// stored value is returned as-is, with no copy made here. This method exists
// for use by tests.
func (s *InMemoryStore) GetVector(key PrimaryKey) vector.T {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Index the map directly with the converted key.
	vec := s.mu.vectors[string(key)]
	return vec
}

// MarshalBinary saves the in-memory store as a byte slice. This allows the
// store to be saved and later loaded without needing to rebuild it from
// scratch.
func (s *InMemoryStore) MarshalBinary() (data []byte, err error) {
Expand Down
10 changes: 10 additions & 0 deletions pkg/sql/vecindex/vecstore/search_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package vecstore

import (
"bytes"
"sort"

"github.com/cockroachdb/cockroach/pkg/util/container/heap"
Expand Down Expand Up @@ -168,6 +169,10 @@ type SearchSet struct {
// among the best results.
MaxExtraResults int

// MatchKey, if non-nil, filters out all search candidates that do not have
// a matching primary key.
MatchKey PrimaryKey

// Stats tracks useful information about the search, such as how many vectors
// and partitions were scanned.
Stats SearchStats
Expand All @@ -180,6 +185,11 @@ type SearchSet struct {
// Add includes a new candidate in the search set. If set limits have been
// reached, then the candidate with the farthest distance will be discarded.
func (ss *SearchSet) Add(candidate *SearchResult) {
if ss.MatchKey != nil && !bytes.Equal(ss.MatchKey, candidate.ChildKey.PrimaryKey) {
// Filter out candidates without a matching primary key.
return
}

// Fast path where no pruning is necessary.
if len(ss.results) < ss.MaxResults {
heap.Push[*SearchResult](&ss.results, candidate)
Expand Down
5 changes: 5 additions & 0 deletions pkg/sql/vecindex/vecstore/search_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,9 @@ func TestSearchSet(t *testing.T) {
otherSet.MaxExtraResults = 1
otherSet.AddAll(SearchResults{result1, result2, result3, result4, result5, result6, result7})
require.Equal(t, SearchResults{result3, result1, result4, result7}, otherSet.PopResults())

// Ignore results without a matching primary key.
otherSet = SearchSet{MaxResults: 2, MatchKey: []byte{60}}
otherSet.AddAll(SearchResults{result1, result2, result3, result4, result5, result6, result7})
require.Equal(t, SearchResults{result6}, otherSet.PopResults())
}
57 changes: 57 additions & 0 deletions pkg/sql/vecindex/vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,63 @@ func (vi *VectorIndex) Insert(
return vi.insertHelper(&parentSearchCtx, childKey, true /* allowRetry */)
}

// Delete attempts to remove a vector from the index, given its value and
// primary key. This is called within the scope of a transaction so that the
// index does not appear to change during the delete.
//
// The vector is located by searching the leaf level for a candidate whose
// primary key matches the given key. If the first search finds nothing, the
// search is retried exactly once with a beam size that is 8x larger. If the
// vector still cannot be found, Delete returns nil without removing anything.
// Any error returned by the search or by the removal from the partition is
// returned to the caller.
//
// NOTE: Delete may not be able to locate the vector in the index, meaning a
// "dangling vector" reference will be left in the tree. Vector index methods
// handle this rare case by checking for duplicates when returning search
// results.
func (vi *VectorIndex) Delete(
	ctx context.Context, txn vecstore.Txn, vector vector.T, key vecstore.PrimaryKey,
) error {
	// Search for the vector in the index.
	searchCtx := searchContext{
		Txn:      txn,
		Original: vector,
		Level:    vecstore.LeafLevel,
		Options: SearchOptions{
			SkipRerank: vi.options.DisableErrorBounds,
		},
	}
	searchCtx.Ctx = internal.WithWorkspace(ctx, &searchCtx.Workspace)

	// Randomize the vector if required by the quantizer.
	tempRandomized := searchCtx.Workspace.AllocVector(vi.quantizer.GetRandomDims())
	defer searchCtx.Workspace.FreeVector(tempRandomized)
	vi.quantizer.RandomizeVector(ctx, vector, tempRandomized, false /* invert */)
	searchCtx.Randomized = tempRandomized

	// Only a candidate with a matching primary key is accepted, so a single
	// result slot is sufficient.
	searchSet := vecstore.SearchSet{MaxResults: 1, MatchKey: key}

	// Search with the base beam size. If that fails to find the vector, try again
	// with a larger beam size, in order to minimize the chance of dangling
	// vector references in the index.
	//
	// The retry check compares against the clamped initial beam size rather than
	// the raw option value. Otherwise, an option value below 1 would never equal
	// the clamped value and the retry would be silently skipped.
	initialBeamSize := max(vi.options.BaseBeamSize, 1)
	baseBeamSize := initialBeamSize
	for {
		searchCtx.Options.BaseBeamSize = baseBeamSize

		err := vi.searchHelper(&searchCtx, &searchSet, true /* allowRetry */)
		if err != nil {
			return err
		}
		results := searchSet.PopResults()
		if len(results) == 0 {
			// Retry search with significantly higher beam size.
			if baseBeamSize == initialBeamSize {
				baseBeamSize *= 8
				continue
			}

			// The retry also failed to locate the vector; give up and leave a
			// possible dangling reference in the index.
			return nil
		}

		// Remove the vector from its partition in the store.
		_, err = vi.removeFromPartition(ctx, txn, results[0].ParentPartitionKey, results[0].ChildKey)
		return err
	}
}

// Search finds vectors in the index that are closest to the given query vector
// and returns them in the search set. Set searchSet.MaxResults to limit the
// number of results. This is called within the scope of a transaction so that
Expand Down
57 changes: 39 additions & 18 deletions pkg/sql/vecindex/vector_index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,31 +262,52 @@ func (s *testState) Delete(d *datadriven.TestData) string {
}
}

txn := beginTransaction(s.Ctx, s.T, s.InMemStore)
defer commitTransaction(s.Ctx, s.T, s.InMemStore, txn)
for i, line := range strings.Split(d.Input, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}

// Get root in order to acquire partition lock.
_, err := s.InMemStore.GetPartition(s.Ctx, txn, vecstore.RootKey)
require.NoError(s.T, err)
// If vector to delete has a colon, then its value is specified as well
// as its name. This is useful for forcing a certain value to delete.
var key vecstore.PrimaryKey
var vec vector.T
parts := strings.Split(line, ":")
if len(parts) == 1 {
// Get the value from the store.
key = vecstore.PrimaryKey(line)
vec = s.InMemStore.GetVector(key)
} else {
require.Len(s.T, parts, 2)
// Parse the value after the colon.
key = vecstore.PrimaryKey(parts[0])
vec = s.parseVector(parts[1])
}

if notFound {
for _, line := range strings.Split(d.Input, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}
// Delete within the scope of a transaction.
txn := beginTransaction(s.Ctx, s.T, s.InMemStore)

// Simulate case where the vector is deleted in the primary index, but
// it cannot be found in the secondary index.
s.InMemStore.DeleteVector(txn, []byte(line))
// If notFound=true, then simulate case where the vector is deleted in
// the primary index, but it cannot be found in the secondary index.
if !notFound {
err := s.Index.Delete(s.Ctx, txn, vec, key)
require.NoError(s.T, err)
}
s.InMemStore.DeleteVector(txn, key)

commitTransaction(s.Ctx, s.T, s.InMemStore, txn)

if (i+1)%s.Options.MaxPartitionSize == 0 {
// Periodically, run synchronous fixups so that test results are
// deterministic.
require.NoError(s.T, s.Index.fixups.runAll(s.Ctx))
}
}

// TODO(andyk): Add code to delete vector from index.
// Handle any remaining fixups.
require.NoError(s.T, s.Index.fixups.runAll(s.Ctx))

tree, err := s.Index.Format(s.Ctx, txn, FormatOptions{PrimaryKeyStrings: true})
require.NoError(s.T, err)
return tree
return s.FormatTree(d)
}

// parseVector parses a vector string in this form: (1.5, 6, -4).
Expand Down

0 comments on commit 8233287

Please sign in to comment.