From 229ce2d110e48faeb827273f230c98240cd194e1 Mon Sep 17 00:00:00 2001 From: carlosdelest Date: Mon, 2 Dec 2024 15:49:42 +0100 Subject: [PATCH] Add YAML tests --- .../search.vectors/41_knn_search_bbq_hnsw.yml | 80 +++++++++++++++++++ .../41_knn_search_byte_quantized.yml | 79 ++++++++++++++++++ .../41_knn_search_half_byte_quantized.yml | 78 ++++++++++++++++++ .../search.vectors/42_knn_search_bbq_flat.yml | 80 +++++++++++++++++++ .../42_knn_search_int4_flat.yml | 77 ++++++++++++++++++ .../42_knn_search_int8_flat.yml | 77 ++++++++++++++++++ 6 files changed, 471 insertions(+) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_bbq_hnsw.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_bbq_hnsw.yml index 188c155e4a836..31f934d58b7ec 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_bbq_hnsw.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_bbq_hnsw.yml @@ -87,6 +87,86 @@ setup: # here we verify that are last hit is always the worst one - match: { hits.hits.2._id: "1" } +--- +"Vector rescoring has similar ordering as knn, same scoring as exact search for kNN section": + - skip: + features: "headers" + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: bbq_hnsw + body: + knn: + field: vector + query_vector: [ 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0] + k: 3 + num_candidates: 3 + + - match: { hits.total: 3 } + - set: { hits.hits.0._id: knn_id0 } + - set: { hits.hits.1._id: knn_id1 } + - set: { hits.hits.2._id: knn_id2 } + - set: { hits.hits.0._score: knn_score0 } + - set: { hits.hits.1._score: knn_score1 } + - set: { hits.hits.2._score: knn_score2 } + + # Rescore + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: bbq_hnsw + body: + knn: + field: vector + query_vector: [ 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0] + k: 3 + num_candidates: 3 + rescore: + oversample: 1.5 + + # Comparing to knn search, we already have changes in ordering and scoring + - match: { hits.hits.0._id: $knn_id1 } + - match: { hits.hits.1._id: $knn_id0 } + - match: { hits.hits.2._id: $knn_id2 } + + # Get rescoring scores + - match: { hits.total: 3 } + - set: { hits.hits.0._id: rescore_id0 } + - set: { hits.hits.1._id: rescore_id1 } + - set: { hits.hits.2._id: rescore_id2 } + - set: { hits.hits.0._score: rescore_score0 } + - set: { hits.hits.1._score: rescore_score1 } + - set: { hits.hits.2._score: rescore_score2 } + + # Exact knn via script score + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "1.0 / (1.0 + Math.pow(l2norm(params.query_vector, 'vector'), 2.0))" + params: + query_vector: [ 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0 ] + + # Check same ordering (which will not be true for larger datasets) + # and scoring (which should be for the elements that are present in both) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $rescore_id0 } + - match: { hits.hits.1._id: $rescore_id1 } + - match: { hits.hits.2._id: $rescore_id2 } + - match: { hits.hits.0._score: $rescore_score0 } + - match: { hits.hits.1._score: $rescore_score1 } + - match: { hits.hits.2._score: $rescore_score2 } + --- "Test bad quantization parameters": - do: diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_byte_quantized.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_byte_quantized.yml index b7a5517309949..2bd4f97614390 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_byte_quantized.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_byte_quantized.yml @@ -368,6 +368,85 @@ setup: - match: {hits.hits.2._id: "1"} - gte: {hits.hits.2._score: 0.78} - lte: {hits.hits.2._score: 0.791} + +--- +# Won't be true for larger datasets, but this helps checking kNN vs rescoring vs exact search +"Vector rescoring has same ordering as knn, same scoring as exact search for kNN section": + - skip: + features: "headers" + + # kNN search + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: hnsw_byte_quantized + body: + size: 3 + query: + knn: + k: 3 + num_candidates: 3 + field: vector + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + + - match: { hits.total: 3 } + - set: { hits.hits.0._id: knn_id0 } + - set: { hits.hits.1._id: knn_id1 } + - set: { hits.hits.2._id: knn_id2 } + + # Rescore + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: hnsw_byte_quantized + body: + size: 3 + query: + knn: + k: 3 + num_candidates: 3 + field: vector + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + rescore: + oversample: 1.5 + + # Check same ordering (which will not be true for larger datasets) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $knn_id0 } + - match: { hits.hits.1._id: $knn_id1 } + - match: { hits.hits.2._id: $knn_id2 } + - set: { hits.hits.0._score: rescore_score0 } + - set: { hits.hits.1._score: rescore_score1 } + - set: { hits.hits.2._score: rescore_score2 } + + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "1.0 / (1.0 + Math.pow(l2norm(params.query_vector, 'vector'), 2.0))" + params: + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + + # Check same ordering (which will not be true for larger datasets) + # and scoring (which should be for the elements that are present in both) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $knn_id0 } + - match: { hits.hits.1._id: $knn_id1 } + - match: { hits.hits.2._id: $knn_id2 } + - match: { hits.hits.0._score: $rescore_score0 } + - match: { hits.hits.1._score: $rescore_score1 } + - match: { hits.hits.2._score: $rescore_score2 } + --- "Test bad quantization parameters": - do: diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_half_byte_quantized.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_half_byte_quantized.yml index 5f1af2ca5c52f..ecf9d08396f30 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_half_byte_quantized.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/41_knn_search_half_byte_quantized.yml @@ -549,6 +549,84 @@ setup: - match: { hits.hits.1._id: "2"} - match: { hits.hits.2._id: "3"} --- +"Vector rescoring has same ordering as knn, same scoring as exact search for kNN section": + - skip: + features: "headers" + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: hnsw_byte_quantized + body: + fields: [ "name" ] + knn: + field: vector + query_vector: [-0.5, 90.0, -10, 14.8] + k: 3 + num_candidates: 3 + + - match: { hits.total: 3 } + - set: { hits.hits.0._id: knn_id0 } + - set: { hits.hits.1._id: knn_id1 } + - set: { hits.hits.2._id: knn_id2 } + + # Rescore + - do: + headers: + Content-Type: application/json + search: + index: hnsw_byte_quantized + rest_total_hits_as_int: true + body: + fields: [ "name" ] + knn: + field: vector + query_vector: [-0.5, 90.0, -10, 14.8] + k: 3 + num_candidates: 3 + rescore: + oversample: 1.5 + + # Comparing to knn search + - match: { hits.hits.0._id: $knn_id0 } + - match: { hits.hits.1._id: $knn_id1 } + - match: { hits.hits.2._id: $knn_id2 } + + # Get rescoring scores + - match: { hits.total: 3 } + - set: { hits.hits.0._id: rescore_id0 } + - set: { hits.hits.1._id: rescore_id1 } + - set: { hits.hits.2._id: rescore_id2 } + - set: { hits.hits.0._score: rescore_score0 } + - set: { hits.hits.1._score: rescore_score1 } + - set: { hits.hits.2._score: rescore_score2 } + + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "1.0 / (1.0 + Math.pow(l2norm(params.query_vector, 'vector'), 2.0))" + params: + query_vector: [-0.5, 90.0, -10, 14.8] + + # Check same ordering (which will not be true for larger datasets) + # and scoring (which should be for the elements that are present in both) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $rescore_id0 } + - match: { hits.hits.1._id: $rescore_id1 } + - match: { hits.hits.2._id: $rescore_id2 } + - match: { hits.hits.0._score: $rescore_score0 } + - match: { hits.hits.1._score: $rescore_score1 } + - match: { hits.hits.2._score: $rescore_score2 } + +--- "Test odd dimensions fail indexing": - do: catch: bad_request diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_bbq_flat.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_bbq_flat.yml index ed7a8dd5df65d..960dbeb786c9a 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_bbq_flat.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_bbq_flat.yml @@ -87,6 +87,86 @@ setup: # here we verify that are last hit is always the worst one - match: { hits.hits.2._id: "1" } --- +"Vector rescoring has similar ordering as knn, same scoring as exact search for kNN section": + - skip: + features: "headers" + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: bbq_flat + body: + knn: + field: vector + query_vector: [ 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0] + k: 3 + num_candidates: 3 + + - match: { hits.total: 3 } + - set: { hits.hits.0._id: knn_id0 } + - set: { hits.hits.1._id: knn_id1 } + - set: { hits.hits.2._id: knn_id2 } + - set: { hits.hits.0._score: knn_score0 } + - set: { hits.hits.1._score: knn_score1 } + - set: { hits.hits.2._score: knn_score2 } + + # Rescore + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: bbq_flat + body: + knn: + field: vector + query_vector: [ 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0] + k: 3 + num_candidates: 3 + rescore: + oversample: 1.5 + + # Comparing to knn search, we already have changes in ordering and scoring + - match: { hits.hits.0._id: $knn_id1 } + - match: { hits.hits.1._id: $knn_id0 } + - match: { hits.hits.2._id: $knn_id2 } + + # Get rescoring scores + - match: { hits.total: 3 } + - set: { hits.hits.0._id: rescore_id0 } + - set: { hits.hits.1._id: rescore_id1 } + - set: { hits.hits.2._id: rescore_id2 } + - set: { hits.hits.0._score: rescore_score0 } + - set: { hits.hits.1._score: rescore_score1 } + - set: { hits.hits.2._score: rescore_score2 } + + # Exact knn via script score + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "1.0 / (1.0 + Math.pow(l2norm(params.query_vector, 'vector'), 2.0))" + params: + query_vector: [ 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0, -0.5, 90.0, -10, 14.8, -156.0] + + # Check same ordering (which will not be true for larger datasets) + # and scoring (which should be for the elements that are present in both) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $rescore_id0 } + - match: { hits.hits.1._id: $rescore_id1 } + - match: { hits.hits.2._id: $rescore_id2 } + - match: { hits.hits.0._score: $rescore_score0 } + - match: { hits.hits.1._score: $rescore_score1 } + - match: { hits.hits.2._score: $rescore_score2 } + +--- "Test bad parameters": - do: catch: bad_request diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int4_flat.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int4_flat.yml index b9a0b16f2bd7a..4661510780a5f 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int4_flat.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int4_flat.yml @@ -344,3 +344,80 @@ setup: index: dynamic_dim_hnsw_quantized body: vector: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] +--- +"Vector rescoring has same ordering as knn, same scoring as exact search for kNN section": + - skip: + features: "headers" + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: int4_flat + body: + fields: [ "name" ] + knn: + field: vector + query_vector: [-0.5, 90.0, -10, 14.8] + k: 3 + num_candidates: 3 + + - match: { hits.total: 3 } + - set: { hits.hits.0._id: knn_id0 } + - set: { hits.hits.1._id: knn_id1 } + - set: { hits.hits.2._id: knn_id2 } + + # Rescore + - do: + headers: + Content-Type: application/json + search: + index: int4_flat + rest_total_hits_as_int: true + body: + fields: [ "name" ] + knn: + field: vector + query_vector: [-0.5, 90.0, -10, 14.8] + k: 3 + num_candidates: 3 + rescore: + oversample: 1.5 + + # Comparing to knn search + - match: { hits.hits.0._id: $knn_id0 } + - match: { hits.hits.1._id: $knn_id1 } + - match: { hits.hits.2._id: $knn_id2 } + + # Get rescoring scores + - match: { hits.total: 3 } + - set: { hits.hits.0._id: rescore_id0 } + - set: { hits.hits.1._id: rescore_id1 } + - set: { hits.hits.2._id: rescore_id2 } + - set: { hits.hits.0._score: rescore_score0 } + - set: { hits.hits.1._score: rescore_score1 } + - set: { hits.hits.2._score: rescore_score2 } + + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "1.0 / (1.0 + Math.pow(l2norm(params.query_vector, 'vector'), 2.0))" + params: + query_vector: [-0.5, 90.0, -10, 14.8] + + # Check same ordering (which will not be true for larger datasets) + # and scoring (which should be for the elements that are present in both) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $rescore_id0 } + - match: { hits.hits.1._id: $rescore_id1 } + - match: { hits.hits.2._id: $rescore_id2 } + - match: { hits.hits.0._score: $rescore_score0 } + - match: { hits.hits.1._score: $rescore_score1 } + - match: { hits.hits.2._score: $rescore_score2 } diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int8_flat.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int8_flat.yml index 139747c5e7ee5..2e02e9063e4da 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int8_flat.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/42_knn_search_int8_flat.yml @@ -262,6 +262,83 @@ setup: - gte: {hits.hits.2._score: 0.78} - lte: {hits.hits.2._score: 0.791} --- +"Vector rescoring has same ordering as knn, same scoring as exact search for kNN section": + - skip: + features: "headers" + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + index: int8_flat + body: + fields: [ "name" ] + knn: + field: vector + query_vector: [-0.5, 90.0, -10, 14.8, -156.0] + k: 3 + num_candidates: 3 + + - match: { hits.total: 3 } + - set: { hits.hits.0._id: knn_id0 } + - set: { hits.hits.1._id: knn_id1 } + - set: { hits.hits.2._id: knn_id2 } + + # Rescore + - do: + headers: + Content-Type: application/json + search: + index: int8_flat + rest_total_hits_as_int: true + body: + fields: [ "name" ] + knn: + field: vector + query_vector: [-0.5, 90.0, -10, 14.8, -156.0] + k: 3 + num_candidates: 3 + rescore: + oversample: 1.5 + + # Comparing to knn search + - match: { hits.hits.0._id: $knn_id0 } + - match: { hits.hits.1._id: $knn_id1 } + - match: { hits.hits.2._id: $knn_id2 } + + # Get rescoring scores + - match: { hits.total: 3 } + - set: { hits.hits.0._id: rescore_id0 } + - set: { hits.hits.1._id: rescore_id1 } + - set: { hits.hits.2._id: rescore_id2 } + - set: { hits.hits.0._score: rescore_score0 } + - set: { hits.hits.1._score: rescore_score1 } + - set: { hits.hits.2._score: rescore_score2 } + + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "1.0 / (1.0 + Math.pow(l2norm(params.query_vector, 'vector'), 2.0))" + params: + query_vector: [-0.5, 90.0, -10, 14.8, -156.0] + + # Check same ordering (which will not be true for larger datasets) + # and scoring (which should be for the elements that are present in both) + - match: { hits.total: 3 } + - match: { hits.hits.0._id: $rescore_id0 } + - match: { hits.hits.1._id: $rescore_id1 } + - match: { hits.hits.2._id: $rescore_id2 } + - match: { hits.hits.0._score: $rescore_score0 } + - match: { hits.hits.1._score: $rescore_score1 } + - match: { hits.hits.2._score: $rescore_score2 } +--- "Test bad parameters": - do: catch: bad_request