diff --git a/pkg/sql/vecindex/testdata/delete.ddt b/pkg/sql/vecindex/testdata/delete.ddt index b96e2965c534..9fbe9521cdee 100644 --- a/pkg/sql/vecindex/testdata/delete.ddt +++ b/pkg/sql/vecindex/testdata/delete.ddt @@ -1,5 +1,141 @@ # ---------- -# Test deleting vectors from primary index, but not from secondary index. +# Construct new index with one vector in the root. +# ---------- +new-index min-partition-size=1 max-partition-size=3 beam-size=2 +vec1: (1, 2) +---- +• 1 (0, 0) +│ +└───• vec1 (1, 2) + +# Delete remaining vector in the root. +delete +vec1 +---- +• 1 (0, 0) + +# ---------- +# Construct new index with only duplicate vectors. +# ---------- +new-index min-partition-size=1 max-partition-size=3 beam-size=2 +vec1: (1, 2) +vec2: (1, 2) +vec3: (1, 2) +vec4: (1, 2) +vec5: (1, 2) +vec6: (1, 2) +---- +• 1 (1, 2) +│ +├───• 2 (1, 2) +│ │ +│ ├───• vec1 (1, 2) +│ └───• vec2 (1, 2) +│ +└───• 3 (1, 2) + │ + ├───• vec3 (1, 2) + ├───• vec4 (1, 2) + ├───• vec5 (1, 2) + └───• vec6 (1, 2) + +# Ensure the correct duplicates are deleted (i.e. with matching keys). +delete +vec1 +vec5 +---- +• 1 (1, 2) +│ +├───• 2 (1, 2) +│ │ +│ └───• vec2 (1, 2) +│ +└───• 3 (1, 2) + │ + ├───• vec3 (1, 2) + ├───• vec4 (1, 2) + └───• vec6 (1, 2) + +# ---------- +# Construct new index with multiple levels. +# ---------- +new-index min-partition-size=1 max-partition-size=3 beam-size=1 +vec1: (1, 2) +vec2: (7, 4) +vec3: (4, 3) +vec4: (-4, 5) +vec5: (1, 11) +vec6: (1, -6) +vec7: (0, 4) +vec8: (-2, 8) +vec9: (2, 8) +---- +• 1 (1.5, 1.875) +│ +├───• 2 (1, -2) +│ │ +│ ├───• vec1 (1, 2) +│ └───• vec6 (1, -6) +│ +├───• 4 (1.75, 4) +│ │ +│ ├───• vec3 (4, 3) +│ ├───• vec4 (-4, 5) +│ ├───• vec7 (0, 4) +│ └───• vec2 (7, 4) +│ +└───• 5 (0.3333, 9) + │ + ├───• vec5 (1, 11) + ├───• vec8 (-2, 8) + └───• vec9 (2, 8) + +# Test case where initial search fails to find vector to delete and it must be +# retried. 
+delete +vec1: (0, 8) +---- +• 1 (1.5, 1.875) +│ +├───• 2 (1, -2) +│ │ +│ └───• vec6 (1, -6) +│ +├───• 4 (1.75, 4) +│ │ +│ ├───• vec3 (4, 3) +│ ├───• vec4 (-4, 5) +│ ├───• vec7 (0, 4) +│ └───• vec2 (7, 4) +│ +└───• 5 (0.3333, 9) + │ + ├───• vec5 (1, 11) + ├───• vec8 (-2, 8) + └───• vec9 (2, 8) + +# Delete multiple vectors. +delete +vec4 +vec5 +vec6 +---- +• 1 (1.5, 1.875) +│ +├───• 2 (1, -2) +├───• 4 (1.75, 4) +│ │ +│ ├───• vec3 (4, 3) +│ ├───• vec2 (7, 4) +│ └───• vec7 (0, 4) +│ +└───• 5 (0.3333, 9) + │ + ├───• vec9 (2, 8) + └───• vec8 (-2, 8) + +# ---------- +# Construct new index with multiple levels. # ---------- new-index min-partition-size=1 max-partition-size=3 beam-size=2 vec1: (1, 2) diff --git a/pkg/sql/vecindex/testdata/insert.ddt b/pkg/sql/vecindex/testdata/insert.ddt index b93b5ac41cc5..a58c198709f7 100644 --- a/pkg/sql/vecindex/testdata/insert.ddt +++ b/pkg/sql/vecindex/testdata/insert.ddt @@ -1,3 +1,6 @@ +# ---------- +# Construct empty index. +# ---------- new-index min-partition-size=1 max-partition-size=4 beam-size=2 ---- • 1 (0, 0) diff --git a/pkg/sql/vecindex/vecstore/in_memory_store.go b/pkg/sql/vecindex/vecstore/in_memory_store.go index cf86eaf2b3f4..38ee2a3ee5cd 100644 --- a/pkg/sql/vecindex/vecstore/in_memory_store.go +++ b/pkg/sql/vecindex/vecstore/in_memory_store.go @@ -352,6 +352,15 @@ func (s *InMemoryStore) DeleteVector(txn Txn, key PrimaryKey) { delete(s.mu.vectors, string(key)) } +// GetVector returns a single vector from the store, by its primary key. This +// is used for testing. +func (s *InMemoryStore) GetVector(key PrimaryKey) vector.T { + s.mu.Lock() + defer s.mu.Unlock() + + return s.mu.vectors[string(key)] +} + // GetAllVectors returns all vectors that have been added to the store as key // and vector pairs. This is used for testing. 
func (s *InMemoryStore) GetAllVectors() []VectorWithKey { diff --git a/pkg/sql/vecindex/vecstore/search_set.go b/pkg/sql/vecindex/vecstore/search_set.go index fc3ba7870fe5..94abd2050a67 100644 --- a/pkg/sql/vecindex/vecstore/search_set.go +++ b/pkg/sql/vecindex/vecstore/search_set.go @@ -6,6 +6,7 @@ package vecstore import ( + "bytes" "sort" "github.com/cockroachdb/cockroach/pkg/util/container/heap" @@ -168,6 +169,10 @@ type SearchSet struct { // among the best results. MaxExtraResults int + // MatchKey, if non-nil, filters out all search candidates that do not have + // a matching primary key. + MatchKey PrimaryKey + // Stats tracks useful information about the search, such as how many vectors // and partitions were scanned. Stats SearchStats @@ -180,6 +185,11 @@ type SearchSet struct { // Add includes a new candidate in the search set. If set limits have been // reached, then the candidate with the farthest distance will be discarded. func (ss *SearchSet) Add(candidate *SearchResult) { + if ss.MatchKey != nil && !bytes.Equal(ss.MatchKey, candidate.ChildKey.PrimaryKey) { + // Filter out candidates without a matching primary key. + return + } + // Fast path where no pruning is necessary. if len(ss.results) < ss.MaxResults { heap.Push[*SearchResult](&ss.results, candidate) diff --git a/pkg/sql/vecindex/vecstore/search_set_test.go b/pkg/sql/vecindex/vecstore/search_set_test.go index c5751d7a29fb..8c08e6f6c07d 100644 --- a/pkg/sql/vecindex/vecstore/search_set_test.go +++ b/pkg/sql/vecindex/vecstore/search_set_test.go @@ -132,4 +132,9 @@ func TestSearchSet(t *testing.T) { otherSet.MaxExtraResults = 1 otherSet.AddAll(SearchResults{result1, result2, result3, result4, result5, result6, result7}) require.Equal(t, SearchResults{result3, result1, result4, result7}, otherSet.PopResults()) + + // Ignore results without a matching primary key. 
+	otherSet = SearchSet{MaxResults: 2, MatchKey: []byte{60}}
+	otherSet.AddAll(SearchResults{result1, result2, result3, result4, result5, result6, result7})
+	require.Equal(t, SearchResults{result6}, otherSet.PopResults())
 }
diff --git a/pkg/sql/vecindex/vector_index.go b/pkg/sql/vecindex/vector_index.go
index 71364cc21a1b..cc560d909d33 100644
--- a/pkg/sql/vecindex/vector_index.go
+++ b/pkg/sql/vecindex/vector_index.go
@@ -209,6 +209,66 @@ func (vi *VectorIndex) Insert(
 	return vi.insertHelper(&parentSearchCtx, childKey, true /* allowRetry */)
 }
 
+// Delete attempts to remove a vector from the index, given its value and
+// primary key. This is called within the scope of a transaction so that the
+// index does not appear to change during the delete.
+//
+// NOTE: Delete may not be able to locate the vector in the index, meaning a
+// "dangling vector" reference will be left in the tree. Vector index methods
+// handle this rare case by checking for duplicates when returning search
+// results.
+func (vi *VectorIndex) Delete(
+	ctx context.Context, txn vecstore.Txn, vector vector.T, key vecstore.PrimaryKey,
+) error {
+	// Search for the vector in the index.
+	searchCtx := searchContext{
+		Txn:      txn,
+		Original: vector,
+		Level:    vecstore.LeafLevel,
+		Options: SearchOptions{
+			SkipRerank: vi.options.DisableErrorBounds,
+		},
+	}
+	searchCtx.Ctx = internal.WithWorkspace(ctx, &searchCtx.Workspace)
+
+	// Randomize the vector if required by the quantizer.
+	tempRandomized := searchCtx.Workspace.AllocVector(vi.quantizer.GetRandomDims())
+	defer searchCtx.Workspace.FreeVector(tempRandomized)
+	vi.quantizer.RandomizeVector(ctx, vector, tempRandomized, false /* invert */)
+	searchCtx.Randomized = tempRandomized
+
+	searchSet := vecstore.SearchSet{MaxResults: 1, MatchKey: key}
+
+	// Search with the base beam size. If that fails to find the vector, try again
+	// with a larger beam size, in order to minimize the chance of dangling
+	// vector references in the index.
+	initialBeamSize := max(vi.options.BaseBeamSize, 1)
+	baseBeamSize := initialBeamSize
+	for {
+		searchCtx.Options.BaseBeamSize = baseBeamSize
+
+		err := vi.searchHelper(&searchCtx, &searchSet, true /* allowRetry */)
+		if err != nil {
+			return err
+		}
+		results := searchSet.PopResults()
+		if len(results) == 0 {
+			// Retry the search once with a significantly higher beam size.
+			// Compare against the clamped initial size rather than the raw
+			// option, so the retry is not skipped when the option is below 1.
+			if baseBeamSize == initialBeamSize {
+				baseBeamSize *= 8
+				continue
+			}
+			return nil
+		}
+
+		// Remove the vector from its partition in the store.
+		_, err = vi.removeFromPartition(ctx, txn, results[0].ParentPartitionKey, results[0].ChildKey)
+		return err
+	}
+}
+
 // Search finds vectors in the index that are closest to the given query vector
 // and returns them in the search set. Set searchSet.MaxResults to limit the
 // number of results. This is called within the scope of a transaction so that
diff --git a/pkg/sql/vecindex/vector_index_test.go b/pkg/sql/vecindex/vector_index_test.go
index 75dc76383f19..9ecd499b9671 100644
--- a/pkg/sql/vecindex/vector_index_test.go
+++ b/pkg/sql/vecindex/vector_index_test.go
@@ -269,31 +269,52 @@ func (s *testState) Delete(d *datadriven.TestData) string {
 		}
 	}
 
-	txn := beginTransaction(s.Ctx, s.T, s.InMemStore)
-	defer commitTransaction(s.Ctx, s.T, s.InMemStore, txn)
+	for i, line := range strings.Split(d.Input, "\n") {
+		line = strings.TrimSpace(line)
+		if len(line) == 0 {
+			continue
+		}
 
-	// Get root in order to acquire partition lock.
-	_, err := s.InMemStore.GetPartition(s.Ctx, txn, vecstore.RootKey)
-	require.NoError(s.T, err)
+		// If vector to delete has a colon, then its value is specified as well
+		// as its name. This is useful for forcing a certain value to delete.
+		var key vecstore.PrimaryKey
+		var vec vector.T
+		parts := strings.Split(line, ":")
+		if len(parts) == 1 {
+			// Get the value from the store.
+			key = vecstore.PrimaryKey(line)
+			vec = s.InMemStore.GetVector(key)
+		} else {
+			require.Len(s.T, parts, 2)
+			// Parse the value after the colon.
+			key = vecstore.PrimaryKey(parts[0])
+			vec = s.parseVector(parts[1])
+		}
 
-	if notFound {
-		for _, line := range strings.Split(d.Input, "\n") {
-			line = strings.TrimSpace(line)
-			if len(line) == 0 {
-				continue
-			}
+		// Delete within the scope of a transaction.
+		txn := beginTransaction(s.Ctx, s.T, s.InMemStore)
 
-			// Simulate case where the vector is deleted in the primary index, but
-			// it cannot be found in the secondary index.
-			s.InMemStore.DeleteVector(txn, []byte(line))
+		// If notFound=true, then simulate case where the vector is deleted in
+		// the primary index, but it cannot be found in the secondary index.
+		if !notFound {
+			err := s.Index.Delete(s.Ctx, txn, vec, key)
+			require.NoError(s.T, err)
+		}
+		s.InMemStore.DeleteVector(txn, key)
+
+		commitTransaction(s.Ctx, s.T, s.InMemStore, txn)
+
+		if (i+1)%s.Options.MaxPartitionSize == 0 {
+			// Periodically, run synchronous fixups so that test results are
+			// deterministic.
+			require.NoError(s.T, s.Index.fixups.runAll(s.Ctx))
 		}
 	}
 
-	// TODO(andyk): Add code to delete vector from index.
+	// Handle any remaining fixups.
+	require.NoError(s.T, s.Index.fixups.runAll(s.Ctx))
 
-	tree, err := s.Index.Format(s.Ctx, txn, FormatOptions{PrimaryKeyStrings: true})
-	require.NoError(s.T, err)
-	return tree
+	return s.FormatTree(d)
 }
 
 func (s *testState) Recall(d *datadriven.TestData) string {