From b916a817c21acef04fdba09cb70977af001a7808 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Gill=C3=A9?=
Date: Tue, 12 Mar 2024 21:34:22 +0100
Subject: [PATCH 1/2] Add basic query benchmark

- Normalized vectors
- Only query similarity
- No metadata or content filter
---
Review notes (placed below the three-dash line, so not part of the commit message):
  * Purpose: adds five BenchmarkCollection_Query_<n> entry points that all delegate to
    benchmarkCollection_Query, which fills a collection with n random normalized
    1536-dimensional embeddings (seeded with 42) and times c.Query for "foo" with 10 results.
  * The value returned by c.AddDocument(...) in the setup loop is discarded.
    NOTE(review): its signature is not visible in this patch; if it returns an error,
    a failed insert would go unnoticed and the benchmark would measure a smaller
    collection. Confirm and check the result.
  * err from c.Query is assigned on every iteration but only inspected after the
    b.N loop, so only the error of the last iteration is ever reported.

 collection_test.go | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/collection_test.go b/collection_test.go
index 1be172f..5593b80 100644
--- a/collection_test.go
+++ b/collection_test.go
@@ -2,7 +2,10 @@ package chromem
 
 import (
 	"context"
+	"errors"
+	"math/rand"
 	"slices"
+	"strconv"
 	"testing"
 )
 
@@ -335,3 +338,89 @@ func TestCollection_Count(t *testing.T) {
 		t.Fatal("expected 2, got", c.Count())
 	}
 }
+
+// Global var for assignment in the benchmark to avoid compiler optimizations.
+var globalRes []Result
+
+func BenchmarkCollection_Query_100(b *testing.B) {
+	benchmarkCollection_Query(b, 100)
+}
+
+func BenchmarkCollection_Query_1000(b *testing.B) {
+	benchmarkCollection_Query(b, 1000)
+}
+
+func BenchmarkCollection_Query_5000(b *testing.B) {
+	benchmarkCollection_Query(b, 5000)
+}
+
+func BenchmarkCollection_Query_25000(b *testing.B) {
+	benchmarkCollection_Query(b, 25000)
+}
+
+func BenchmarkCollection_Query_100000(b *testing.B) {
+	benchmarkCollection_Query(b, 100_000)
+}
+
+// n is number of documents in the collection
+func benchmarkCollection_Query(b *testing.B, n int) {
+	ctx := context.Background()
+
+	// Seed to make deterministic
+	r := rand.New(rand.NewSource(42))
+
+	d := 1536 // dimensions, same as text-embedding-3-small
+	// Random query vector
+	qv := make([]float32, d)
+	for j := 0; j < d; j++ {
+		qv[j] = r.Float32()
+	}
+	// Most embeddings are normalized, so we normalize this one too
+	qv = normalizeVector(qv)
+	embeddingFunc := func(_ context.Context, text string) ([]float32, error) {
+		if text != "foo" {
+			return nil, errors.New("embedding func not expected to be called")
+		}
+		return qv, nil
+	}
+
+	// Create collection
+	db := NewDB()
+	name := "test"
+	c, err := db.CreateCollection(name, nil, embeddingFunc)
+	if err != nil {
+		b.Fatal("expected no error, got", err)
+	}
+	if c == nil {
+		b.Fatal("expected collection, got nil")
+	}
+
+	// Add documents
+	for i := 0; i < n; i++ {
+		// Random embedding
+		v := make([]float32, d)
+		for j := 0; j < d; j++ {
+			v[j] = r.Float32()
+		}
+		v = normalizeVector(v)
+
+		// Add document without metadata or content.
+		// When providing embeddings, the embedding func is not called.
+		c.AddDocument(ctx, Document{
+			ID:        strconv.Itoa(i),
+			Embedding: v,
+		})
+	}
+
+	b.ResetTimer()
+
+	// Query
+	var res []Result
+	for i := 0; i < b.N; i++ {
+		res, err = c.Query(ctx, "foo", 10, nil, nil)
+	}
+	if err != nil {
+		b.Fatal("expected nil, got", err)
+	}
+	globalRes = res
+}

From 87ad4e94a3156dc7906bf72f55cd67ad6500960c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Gill=C3=A9?=
Date: Tue, 12 Mar 2024 22:41:09 +0100
Subject: [PATCH 2/2] Add benchmark with document content

- More relevant for basic chromem-go usage
- String copying during querying might be very relevant for performance
---
Review notes (placed below the three-dash line, so not part of the commit message):
  * Purpose: adds a withContent parameter to benchmarkCollection_Query. The existing
    BenchmarkCollection_Query_<n> entry points now pass true, and new
    BenchmarkCollection_Query_NoContent_<n> entry points pass false. Every document
    now carries two metadata entries; with withContent it also gets a 1875-character
    string from the new randomString helper.
  * The unchanged context comment "n is number of documents in the collection" does
    not mention the new withContent parameter.
  * The value returned by c.AddDocument(ctx, doc) is still discarded.
    NOTE(review): confirm its signature and check the result if it can fail.
  * randomString's comment speaks of 5 spaces, but the character-set literal as it
    appears in this copy of the patch ends with a single space. Presumably runs of
    whitespace were collapsed in transit; verify the literal against the repository
    before applying.

 collection_test.go | 59 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/collection_test.go b/collection_test.go
index 5593b80..883fdab 100644
--- a/collection_test.go
+++ b/collection_test.go
@@ -342,28 +342,48 @@ func TestCollection_Count(t *testing.T) {
 // Global var for assignment in the benchmark to avoid compiler optimizations.
 var globalRes []Result
 
+func BenchmarkCollection_Query_NoContent_100(b *testing.B) {
+	benchmarkCollection_Query(b, 100, false)
+}
+
+func BenchmarkCollection_Query_NoContent_1000(b *testing.B) {
+	benchmarkCollection_Query(b, 1000, false)
+}
+
+func BenchmarkCollection_Query_NoContent_5000(b *testing.B) {
+	benchmarkCollection_Query(b, 5000, false)
+}
+
+func BenchmarkCollection_Query_NoContent_25000(b *testing.B) {
+	benchmarkCollection_Query(b, 25000, false)
+}
+
+func BenchmarkCollection_Query_NoContent_100000(b *testing.B) {
+	benchmarkCollection_Query(b, 100_000, false)
+}
+
 func BenchmarkCollection_Query_100(b *testing.B) {
-	benchmarkCollection_Query(b, 100)
+	benchmarkCollection_Query(b, 100, true)
 }
 
 func BenchmarkCollection_Query_1000(b *testing.B) {
-	benchmarkCollection_Query(b, 1000)
+	benchmarkCollection_Query(b, 1000, true)
 }
 
 func BenchmarkCollection_Query_5000(b *testing.B) {
-	benchmarkCollection_Query(b, 5000)
+	benchmarkCollection_Query(b, 5000, true)
 }
 
 func BenchmarkCollection_Query_25000(b *testing.B) {
-	benchmarkCollection_Query(b, 25000)
+	benchmarkCollection_Query(b, 25000, true)
 }
 
 func BenchmarkCollection_Query_100000(b *testing.B) {
-	benchmarkCollection_Query(b, 100_000)
+	benchmarkCollection_Query(b, 100_000, true)
 }
 
 // n is number of documents in the collection
-func benchmarkCollection_Query(b *testing.B, n int) {
+func benchmarkCollection_Query(b *testing.B, n int, withContent bool) {
 	ctx := context.Background()
 
 	// Seed to make deterministic
@@ -404,12 +424,19 @@ func benchmarkCollection_Query(b *testing.B, n int) {
 		}
 		v = normalizeVector(v)
 
-		// Add document without metadata or content.
+		// Add document with some metadata and content depending on parameter.
 		// When providing embeddings, the embedding func is not called.
-		c.AddDocument(ctx, Document{
-			ID:        strconv.Itoa(i),
+		is := strconv.Itoa(i)
+		doc := Document{
+			ID:        is,
+			Metadata:  map[string]string{"i": is, "foo": "bar" + is},
 			Embedding: v,
-		})
+		}
+		if withContent {
+			// Let's say we embed 500 tokens, that's ~375 words, ~1875 characters
+			doc.Content = randomString(r, 1875)
+		}
+		c.AddDocument(ctx, doc)
 	}
 
 	b.ResetTimer()
@@ -424,3 +451,15 @@ func benchmarkCollection_Query(b *testing.B, n int) {
 	}
 	globalRes = res
 }
+
+// randomString returns a random string of length n using lowercase letters and space.
+func randomString(r *rand.Rand, n int) string {
+	// We add 5 spaces to get roughly one space every 5 characters
+	characters := []rune("abcdefghijklmnopqrstuvwxyz ")
+
+	b := make([]rune, n)
+	for i := range b {
+		b[i] = characters[r.Intn(len(characters))]
+	}
+	return string(b)
+}