Skip to content

Commit

Permalink
Merge #135004
Browse files Browse the repository at this point in the history
135004: vecindex: implement C-SPANN search r=mw5h a=andy-kimball

Add the VectorIndex class, which will implement the C-SPANN algorithm. C-SPANN adapts Microsoft’s SPANN and SPFresh algorithms to work well with CockroachDB’s unique distributed architecture. This PR implements K-means tree search with a test implementation of bottom-up tree construction. Later PRs will include code to incrementally build the tree.

Epic: CRDB-42943

Release note: None

Co-authored-by: Andrew Kimball <[email protected]>
  • Loading branch information
craig[bot] and andy-kimball committed Nov 13, 2024
2 parents 29b34ea + ac81c4a commit f60d2d3
Show file tree
Hide file tree
Showing 14 changed files with 1,641 additions and 53 deletions.
16 changes: 14 additions & 2 deletions pkg/sql/vecindex/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@ filegroup(

go_library(
name = "vecindex",
srcs = ["kmeans.go"],
srcs = [
"kmeans.go",
"vector_index.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/sql/vecindex",
visibility = ["//visibility:public"],
deps = [
"//pkg/sql/vecindex/internal",
"//pkg/sql/vecindex/quantize",
"//pkg/sql/vecindex/vecstore",
"//pkg/util/num32",
"//pkg/util/vector",
"@com_github_cockroachdb_errors//:errors",
Expand All @@ -21,14 +26,21 @@ go_library(

go_test(
name = "vecindex_test",
srcs = ["kmeans_test.go"],
srcs = [
"kmeans_test.go",
"vector_index_test.go",
],
data = glob(["testdata/**"]),
embed = [":vecindex"],
deps = [
"//pkg/sql/vecindex/internal",
"//pkg/sql/vecindex/quantize",
"//pkg/sql/vecindex/testutils",
"//pkg/sql/vecindex/vecstore",
"//pkg/util/num32",
"//pkg/util/vector",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
"@org_gonum_v1_gonum//floats/scalar",
"@org_gonum_v1_gonum//stat",
Expand Down
2 changes: 2 additions & 0 deletions pkg/sql/vecindex/quantize/rabitq_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,10 @@ func TestRaBitQuantizerEdge(t *testing.T) {

vectors := vector.MakeSet(141)
vectors.AddUndefined(2)
zeros := vectors.At(0)
ones := vectors.At(1)
for i := 0; i < len(ones); i++ {
zeros[i] = 0
ones[i] = 1
}
quantizedSet := quantizer.Quantize(ctx, &vectors).(*RaBitQuantizedVectorSet)
Expand Down
90 changes: 90 additions & 0 deletions pkg/sql/vecindex/testdata/delete.ddt
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# ----------
# Test deleting vectors from primary index, but not from secondary index.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
vec2: (7, 4)
vec3: (4, 3)
vec4: (5, 5)
----
• 1 (3.1667, 3)
├───• 2 (5.3333, 4)
│ │
│ ├───• vec2 (7, 4)
│ ├───• vec3 (4, 3)
│ └───• vec4 (5, 5)
└───• 3 (1, 2)
└───• vec1 (1, 2)

# Delete vector from primary index, but not from secondary index.
delete not-found
vec3
----
• 1 (3.1667, 3)
├───• 2 (5.3333, 4)
│ │
│ ├───• vec2 (7, 4)
│ ├───• vec3 (MISSING)
│ └───• vec4 (5, 5)
└───• 3 (1, 2)
└───• vec1 (1, 2)

# Ensure deleted vector is not returned by search. This should enqueue a fixup
# that removes the vector from the index.
search max-results=1
(4, 3)
----
vec4: 5 (centroid=1.0541)
4 leaf vectors, 6 vectors, 2 full vectors, 3 partitions

# Again, with higher max results.
search max-results=2
(4, 3)
----
vec4: 5 (centroid=1.0541)
vec2: 10 (centroid=1.6667)
4 leaf vectors, 6 vectors, 4 full vectors, 3 partitions

# Vector should now be gone from the index.
# TODO(andyk): This will be true once fixups are added.
format-tree
----
• 1 (3.1667, 3)
├───• 2 (5.3333, 4)
│ │
│ ├───• vec2 (7, 4)
│ ├───• vec3 (MISSING)
│ └───• vec4 (5, 5)
└───• 3 (1, 2)
└───• vec1 (1, 2)

# Delete all vectors from one branch of the tree.
delete not-found
vec1
----
• 1 (3.1667, 3)
├───• 2 (5.3333, 4)
│ │
│ ├───• vec2 (7, 4)
│ ├───• vec3 (MISSING)
│ └───• vec4 (5, 5)
└───• 3 (1, 2)
└───• vec1 (MISSING)

# Search the empty branch.
search max-results=1 beam-size=1
(1, 2)
----
1 leaf vectors, 3 vectors, 1 full vectors, 2 partitions
72 changes: 72 additions & 0 deletions pkg/sql/vecindex/testdata/search-features.ddt
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Load 500 features, each with 512 dimensions, and search them. Use a small
# partition size to ensure a deeper tree.

new-index dims=512 min-partition-size=2 max-partition-size=8 quality-samples=4 beam-size=2 load-features=500 hide-tree
----
Created index with 500 vectors with 512 dimensions.

# Start with 1 result and default beam size of 2.
search max-results=1 use-feature=9999
----
vec441: 0.4646 (centroid=0.382)
9 leaf vectors, 39 vectors, 2 full vectors, 5 partitions

# Search for additional results.
search max-results=3 use-feature=9999
----
vec441: 0.4646 (centroid=0.382)
vec99: 0.6356 (centroid=0.382)
vec296: 0.7638 (centroid=0.5962)
9 leaf vectors, 39 vectors, 6 full vectors, 5 partitions

# Use a larger beam size.
search max-results=6 use-feature=9999 beam-size=8
----
vec74: 0.4155 (centroid=0.5092)
vec195: 0.4359 (centroid=0.5127)
vec441: 0.4646 (centroid=0.382)
vec77: 0.4894 (centroid=0.4286)
vec355: 0.5821 (centroid=0.4617)
vec328: 0.6032 (centroid=0.5276)
58 leaf vectors, 123 vectors, 14 full vectors, 15 partitions

# Turn off re-ranking, which results in increased inaccuracy.
search max-results=6 use-feature=9999 beam-size=8 skip-rerank
----
vec195: 0.4179 ±0.0264 (centroid=0.5127)
vec74: 0.4322 ±0.0263 (centroid=0.5092)
vec441: 0.4657 ±0.0215 (centroid=0.382)
vec77: 0.4881 ±0.0221 (centroid=0.4286)
vec355: 0.5658 ±0.0238 (centroid=0.4617)
vec415: 0.6142 ±0.0302 (centroid=0.5306)
58 leaf vectors, 123 vectors, 0 full vectors, 15 partitions

# Return top 25 results.
search max-results=25 use-feature=9999 beam-size=8
----
vec74: 0.4155 (centroid=0.5092)
vec195: 0.4359 (centroid=0.5127)
vec441: 0.4646 (centroid=0.382)
vec77: 0.4894 (centroid=0.4286)
vec355: 0.5821 (centroid=0.4617)
vec328: 0.6032 (centroid=0.5276)
vec389: 0.6183 (centroid=0.5267)
vec415: 0.6298 (centroid=0.5306)
vec99: 0.6356 (centroid=0.382)
vec267: 0.6742 (centroid=0.526)
vec6: 0.685 (centroid=0.6015)
vec485: 0.6867 (centroid=0.362)
vec236: 0.687 (centroid=0.5071)
vec198: 0.6885 (centroid=0.5094)
vec65: 0.6898 (centroid=0.4403)
vec146: 0.6901 (centroid=0.5601)
vec282: 0.7197 (centroid=0.4023)
vec410: 0.728 (centroid=0.4261)
vec356: 0.7341 (centroid=0.4352)
vec439: 0.7428 (centroid=0.6023)
vec116: 0.7462 (centroid=0.4643)
vec273: 0.7555 (centroid=0.5226)
vec453: 0.7735 (centroid=0.3571)
vec233: 0.7737 (centroid=0.5502)
vec331: 0.7793 (centroid=0.4871)
58 leaf vectors, 123 vectors, 44 full vectors, 15 partitions
Loading

0 comments on commit f60d2d3

Please sign in to comment.