From 12d80c61e5a742ae188d9fc78544e1402524be87 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Thu, 11 Jul 2024 08:20:27 +0000 Subject: [PATCH 01/17] refactor: deprecated ExactMatchFilter and use MetadataFilter with many operator support --- examples/metadata-filter/simple.ts | 4 +-- .../src/storage/vectorStore/PGVectorStore.ts | 5 ++- .../vectorStore/PineconeVectorStore.ts | 10 ++++-- .../storage/vectorStore/SimpleVectorStore.ts | 29 +++++++++------ .../src/storage/vectorStore/types.ts | 36 +++++++++++++++++-- .../vectorStores/SimpleVectorStore.test.ts | 2 +- 6 files changed, 66 insertions(+), 20 deletions(-) diff --git a/examples/metadata-filter/simple.ts b/examples/metadata-filter/simple.ts index 34226768e3..32d1fe09db 100644 --- a/examples/metadata-filter/simple.ts +++ b/examples/metadata-filter/simple.ts @@ -74,12 +74,12 @@ async function main() { { key: "private", value: "false", - filterType: "ExactMatch", + operator: "==", }, { key: "dogId", value: "3", - filterType: "ExactMatch", + operator: "==", }, ], }, diff --git a/packages/llamaindex/src/storage/vectorStore/PGVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/PGVectorStore.ts index e54c64aa29..468f5966d0 100644 --- a/packages/llamaindex/src/storage/vectorStore/PGVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/PGVectorStore.ts @@ -272,7 +272,10 @@ export class PGVectorStore query.filters?.filters.forEach((filter, index) => { const paramIndex = params.length + 1; whereClauses.push(`metadata->>'${filter.key}' = $${paramIndex}`); - params.push(filter.value); + // TODO: support filter with other operators + if (!Array.isArray(filter.value)) { + params.push(filter.value); + } }); const where = diff --git a/packages/llamaindex/src/storage/vectorStore/PineconeVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/PineconeVectorStore.ts index 81d1efa6cc..50a2a6241d 100644 --- a/packages/llamaindex/src/storage/vectorStore/PineconeVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/PineconeVectorStore.ts @@ -1,7 +1,7 @@ import { VectorStoreBase, - type ExactMatchFilter, type IEmbedModel, + type MetadataFilter, type MetadataFilters, type VectorStoreNoEmbedModel, type VectorStoreQuery, @@ -199,8 +199,12 @@ export class PineconeVectorStore } toPineconeFilter(stdFilters?: MetadataFilters) { - return stdFilters?.filters?.reduce((carry: any, item: ExactMatchFilter) => { - carry[item.key] = item.value; + return stdFilters?.filters?.reduce((carry: any, item: MetadataFilter) => { + // Use MetadataFilter with EQ operator to replace ExactMatchFilter + // TODO: support filter with other operators + if (item.operator === "==") { + carry[item.key] = item.value; + } return carry; }, {}); } diff --git a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts index 9a63d74a79..cecf8ee012 100644 --- a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts @@ -9,9 +9,11 @@ import { import { exists } from "../FileSystem.js"; import { DEFAULT_PERSIST_DIR } from "../constants.js"; import { + FilterOperator, VectorStoreBase, VectorStoreQueryMode, type IEmbedModel, + type MetadataFilter, type VectorStoreNoEmbedModel, type VectorStoreQuery, type VectorStoreQueryResult, @@ -26,10 +28,12 @@ const LEARNER_MODES = new Set([ const MMR_MODE = VectorStoreQueryMode.MMR; +type MetadataValue = Record; + class SimpleVectorStoreData { embeddingDict: Record = {}; textIdToRefDocId: Record = {}; - metadataDict: Record> = {}; + metadataDict: Record = {}; } export class SimpleVectorStore @@ -105,13 +109,16 @@ export class SimpleVectorStore }> { const items = Object.entries(this.data.embeddingDict); - const metadataLookup = { - ExactMatch: ( - metadata: Record, - key: string, - value: string | number, - ) => { - return String(metadata[key]) === value.toString(); // compare as string + const operatorToFilterFn: Record< + FilterOperator, + ( + input: Omit & { + metadata: MetadataValue; + }, + ) => boolean + > = { + "==": (input) => { + return String(input.metadata[input.key]) === input.value.toString(); // compare as string }, }; @@ -119,13 +126,13 @@ export class SimpleVectorStore if (!query.filters) return true; const filters = query.filters.filters; for (const filter of filters) { - const { key, value, filterType } = filter; - const metadataLookupFn = metadataLookup[filterType]; + const { key, value, operator } = filter; + const metadataLookupFn = operatorToFilterFn[operator]; const metadata = this.data.metadataDict[nodeId]; const isMatch = metadataLookupFn && metadata && - metadataLookupFn(metadata, key, value); + metadataLookupFn({ metadata, key, value }); if (!isMatch) return false; // TODO: handle condition OR AND } return true; diff --git a/packages/llamaindex/src/storage/vectorStore/types.ts b/packages/llamaindex/src/storage/vectorStore/types.ts index 0249cb2754..c38df62fe3 100644 --- a/packages/llamaindex/src/storage/vectorStore/types.ts +++ b/packages/llamaindex/src/storage/vectorStore/types.ts @@ -20,19 +20,51 @@ export enum VectorStoreQueryMode { MMR = "mmr", } +/** + * @deprecated Use MetadataFilter with operator EQ instead + */ export interface ExactMatchFilter { filterType: "ExactMatch"; key: string; value: string | number; } +export enum FilterOperator { + EQ = "==", // default operator (string, number) + // GT = ">", // greater than (number) + // LT = "<", // less than (number) + // NE = "!=", // not equal to (string, number) + // GTE = ">=", // greater than or equal to (number) + // LTE = "<=", // less than or equal to (number) + // IN = "in", // In array (string or number) + // NIN = "nin", // Not in array (string or number) + // ANY = "any", // Contains any (array of strings) + // ALL = "all", // Contains all (array of strings) + // TEXT_MATCH = "text_match", // full text match (allows you to search for a specific substring, token or phrase within the text field) + // CONTAINS = "contains", // metadata array contains value (string or number) +} + +export enum FilterCondition { + AND = "and", + OR = "or", +} + +export type MetadataFilterValue = string | number | string[] | number[]; + +export interface MetadataFilter { + key: string; + value: MetadataFilterValue; + operator: `${FilterOperator}`; // ==, any, all,... +} + export interface MetadataFilters { - filters: ExactMatchFilter[]; + filters: Array; + condition?: `${FilterCondition}`; // and, or } export interface VectorStoreQuerySpec { query: string; - filters: ExactMatchFilter[]; + filters: MetadataFilter[]; topK?: number; } diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index 67d91071aa..aa948cfdb3 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -74,7 +74,7 @@ describe("SimpleVectorStore", () => { { key: "private", value: "false", - filterType: "ExactMatch", + operator: "==", }, ], }, From c816d7282a7ce1d6f0c9bac89765446d4ece2e44 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Thu, 11 Jul 2024 08:37:22 +0000 Subject: [PATCH 02/17] feat: support IN operator for SimpleVectorStore --- examples/metadata-filter/simple.ts | 26 ++++++++++++++++--- .../storage/vectorStore/SimpleVectorStore.ts | 12 ++++++++- .../src/storage/vectorStore/types.ts | 2 +- .../vectorStores/SimpleVectorStore.test.ts | 20 +++++++++++++- 4 files changed, 54 insertions(+), 6 deletions(-) diff --git a/examples/metadata-filter/simple.ts b/examples/metadata-filter/simple.ts index 32d1fe09db..441fef67e1 100644 --- a/examples/metadata-filter/simple.ts +++ b/examples/metadata-filter/simple.ts @@ -66,7 +66,7 @@ async function main() { console.log("No filter response:", noFilterResponse.toString()); console.log( - "\n=============\nQuerying index with dogId 2. The output always should be red.", + "\n=============\nQuerying index with dogId 2 and private false. The output always should be red.", ); const queryEngineDogId2 = index.asQueryEngine({ preFilters: { @@ -85,10 +85,30 @@ async function main() { }, similarityTopK: 3, }); - const response = await queryEngineDogId2.query({ + const responseEQ = await queryEngineDogId2.query({ query: "What is the color of the dog?", }); - console.log("Filter with dogId 2 response:", response.toString()); + console.log("Filter with dogId 2 response:", responseEQ.toString()); + + console.log( + "\n=============\nQuerying index with dogId IN (1, 3). The output should be brown and red.", + ); + const queryEngineDogId3 = index.asQueryEngine({ + preFilters: { + filters: [ + { + key: "dogId", + value: ["1", "3"], + operator: "in", + }, + ], + }, + similarityTopK: 3, + }); + const responseIN = await queryEngineDogId3.query({ + query: "What is the color of the dog?", + }); + console.log("Filter with dogId IN (1, 3) response:", responseIN.toString()); } void main(); diff --git a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts index cecf8ee012..cc57208ade 100644 --- a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts @@ -117,9 +117,19 @@ export class SimpleVectorStore }, ) => boolean > = { - "==": (input) => { + [FilterOperator.EQ]: function (input): boolean { return String(input.metadata[input.key]) === input.value.toString(); // compare as string }, + [FilterOperator.IN]: function (input): boolean { + if (!Array.isArray(input.value)) { + throw new Error( + "To use IN, value must be an array of strings or numbers", + ); + } + return input.value + .map(String) + .includes(String(input.metadata[input.key])); + }, }; const queryFilterFn = (nodeId: string) => { diff --git a/packages/llamaindex/src/storage/vectorStore/types.ts b/packages/llamaindex/src/storage/vectorStore/types.ts index c38df62fe3..5e636f7bd0 100644 --- a/packages/llamaindex/src/storage/vectorStore/types.ts +++ b/packages/llamaindex/src/storage/vectorStore/types.ts @@ -31,12 +31,12 @@ export interface ExactMatchFilter { export enum FilterOperator { EQ = "==", // default operator (string, number) + IN = "in", // In array (string or number) // GT = ">", // greater than (number) // LT = "<", // less than (number) // NE = "!=", // not equal to (string, number) // GTE = ">=", // greater than or equal to (number) // LTE = "<=", // less than or equal to (number) - // IN = "in", // In array (string or number) // NIN = "nin", // Not in array (string or number) // ANY = "any", // Contains any (array of strings) // ALL = "all", // Contains all (array of strings) diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index aa948cfdb3..1f410d7182 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -63,7 +63,7 @@ describe("SimpleVectorStore", () => { }); expect(result.similarities).length(3); }); - it("able to query nodes with filter", async () => { + it("able to query nodes with filter EQ", async () => { await store.add(nodes); const result = await store.query({ queryEmbedding: [0.1, 0.2], @@ -81,5 +81,23 @@ describe("SimpleVectorStore", () => { }); expect(result.similarities).length(2); }); + it("able to query nodes with filter IN", async () => { + await store.add(nodes); + const result = await store.query({ + queryEmbedding: [0.1, 0.2], + similarityTopK: 3, + mode: VectorStoreQueryMode.DEFAULT, + filters: { + filters: [ + { + key: "dogId", + value: ["1", "3"], + operator: "in", + }, + ], + }, + }); + expect(result.similarities).length(2); + }); }); }); From b2f01b7503922732644dd6dbed9b4b35ef9d6381 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Thu, 11 Jul 2024 09:40:51 +0000 Subject: [PATCH 03/17] feat: support or and and condition --- examples/metadata-filter/simple.ts | 37 +++++++++++++++++-- .../storage/vectorStore/SimpleVectorStore.ts | 18 +++++---- .../vectorStores/SimpleVectorStore.test.ts | 24 ++++++++++++ 3 files changed, 68 insertions(+), 11 deletions(-) diff --git a/examples/metadata-filter/simple.ts b/examples/metadata-filter/simple.ts index 441fef67e1..245e48c3df 100644 --- a/examples/metadata-filter/simple.ts +++ b/examples/metadata-filter/simple.ts @@ -68,7 +68,7 @@ async function main() { console.log( "\n=============\nQuerying index with dogId 2 and private false. The output always should be red.", ); - const queryEngineDogId2 = index.asQueryEngine({ + const queryEngineEQ = index.asQueryEngine({ preFilters: { filters: [ { @@ -85,7 +85,7 @@ async function main() { }, similarityTopK: 3, }); - const responseEQ = await queryEngineDogId2.query({ + const responseEQ = await queryEngineEQ.query({ query: "What is the color of the dog?", }); console.log("Filter with dogId 2 response:", responseEQ.toString()); @@ -93,7 +93,7 @@ async function main() { console.log( "\n=============\nQuerying index with dogId IN (1, 3). The output should be brown and red.", ); - const queryEngineDogId3 = index.asQueryEngine({ + const queryEngineIN = index.asQueryEngine({ preFilters: { filters: [ { @@ -105,10 +105,39 @@ async function main() { }, similarityTopK: 3, }); - const responseIN = await queryEngineDogId3.query({ + const responseIN = await queryEngineIN.query({ query: "What is the color of the dog?", }); console.log("Filter with dogId IN (1, 3) response:", responseIN.toString()); + + console.log( + "\n=============\nQuerying index with dogId IN (1, 3). The output should be any.", + ); + const queryEngineOR = index.asQueryEngine({ + preFilters: { + filters: [ + { + key: "private", + value: "false", + operator: "==", + }, + { + key: "dogId", + value: ["1", "3"], + operator: "in", + }, + ], + condition: "or", + }, + similarityTopK: 3, + }); + const responseOR = await queryEngineOR.query({ + query: "What is the color of the dog?", + }); + console.log( + "Filter with dogId with OR operator response:", + responseOR.toString(), + ); } void main(); diff --git a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts index cc57208ade..b023a8a777 100644 --- a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts @@ -134,18 +134,22 @@ export class SimpleVectorStore const queryFilterFn = (nodeId: string) => { if (!query.filters) return true; - const filters = query.filters.filters; - for (const filter of filters) { + const { filters, condition } = query.filters; + const queryCondition = condition || "and"; // default to and + + const queryFilterItemFn = (filter: MetadataFilter) => { const { key, value, operator } = filter; const metadataLookupFn = operatorToFilterFn[operator]; const metadata = this.data.metadataDict[nodeId]; - const isMatch = + return ( metadataLookupFn && metadata && - metadataLookupFn({ metadata, key, value }); - if (!isMatch) return false; // TODO: handle condition OR AND - } - return true; + metadataLookupFn({ metadata, key, value }) + ); + }; + + if (queryCondition === "and") return filters.every(queryFilterItemFn); + return filters.some(queryFilterItemFn); }; const nodeFilterFn = (nodeId: string) => { diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index 1f410d7182..0338987c16 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -99,5 +99,29 @@ describe("SimpleVectorStore", () => { }); expect(result.similarities).length(2); }); + it("able to query nodes with filter condition OR", async () => { + await store.add(nodes); + const result = await store.query({ + queryEmbedding: [0.1, 0.2], + similarityTopK: 3, + mode: VectorStoreQueryMode.DEFAULT, + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "==", + }, + { + key: "dogId", + value: ["1", "3"], + operator: "in", + }, + ], + condition: "or", + }, + }); + expect(result.similarities).length(3); + }); }); }); From 6735e6489afd00c738dea216827abf268a437e04 Mon Sep 17 00:00:00 2001 From: Marcus Schiesser Date: Thu, 11 Jul 2024 19:01:27 +0700 Subject: [PATCH 04/17] Create famous-poets-hammer.md --- .changeset/famous-poets-hammer.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changeset/famous-poets-hammer.md diff --git a/.changeset/famous-poets-hammer.md b/.changeset/famous-poets-hammer.md new file mode 100644 index 0000000000..7d97257dce --- /dev/null +++ b/.changeset/famous-poets-hammer.md @@ -0,0 +1,6 @@ +--- +"llamaindex": patch +"@llamaindex/llamaindex-test": patch +--- + +Add support for Metadata filters From 7cfedfc61652eecd5b31617f1e642e808d2e7387 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:08:03 +0000 Subject: [PATCH 05/17] feat: add filter and example to use Milvus --- examples/milvus/query.ts | 44 +++++++++++++++---- .../storage/vectorStore/MilvusVectorStore.ts | 39 ++++++++++++++++ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/examples/milvus/query.ts b/examples/milvus/query.ts index ff33fc69a7..d4f308f514 100644 --- a/examples/milvus/query.ts +++ b/examples/milvus/query.ts @@ -5,18 +5,44 @@ const collectionName = "movie_reviews"; async function main() { try { const milvus = new MilvusVectorStore({ collection: collectionName }); - const index = await VectorStoreIndex.fromVectorStore(milvus); + const retriever = index.asRetriever({ similarityTopK: 20 }); - const retriever = await index.asRetriever({ similarityTopK: 20 }); - - const queryEngine = await index.asQueryEngine({ retriever }); - - const results = await queryEngine.query({ - query: "What is the best reviewed movie?", + console.log("=====\nQuerying the index without any filters."); + const queryEngineNoFilters = index.asQueryEngine({ retriever }); + const resultNoFilter = await queryEngineNoFilters.query({ + query: "Summary movie reviews", }); - - console.log(results.response); + console.log(`Query from ${resultNoFilter.sourceNodes?.length} nodes`); + console.log(resultNoFilter.response); + + console.log("\n=====\nQuerying the index with filters"); + const queryEngineWithFilters = index.asQueryEngine({ + retriever, + preFilters: { + filters: [ + { + key: "doc_id", + value: [ + "./data/movie_reviews.csv_95", + "./data/movie_reviews.csv_101", + ], + operator: "in", + }, + { + key: "document_id", + value: "./data/movie_reviews.csv_37", + operator: "==", + }, + ], + condition: "or", + }, + }); + const resultAfterFilter = await queryEngineWithFilters.query({ + query: "Summary movie reviews", + }); + console.log(`Query from ${resultAfterFilter.sourceNodes?.length} nodes`); + console.log(resultAfterFilter.response); } catch (e) { console.error(e); } diff --git a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts index 42b7df5687..c96d0bc142 100644 --- a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts @@ -11,12 +11,44 @@ import { import { VectorStoreBase, type IEmbedModel, + type MetadataFilters, type VectorStoreNoEmbedModel, type VectorStoreQuery, type VectorStoreQueryResult, } from "./types.js"; import { metadataDictToNode, nodeToMetadata } from "./utils.js"; +function parseScalarFilters(scalarFilters: MetadataFilters): string { + const condition = scalarFilters.condition ?? "and"; + const filters: string[] = []; + + for (const filter of scalarFilters.filters) { + switch (filter.operator) { + case "==": { + if (Array.isArray(filter.value)) { + throw new Error("Operator '==' does not support array value"); + } + const filterValue = String(filter.value); + filters.push(`metadata["${filter.key}"] == "${filterValue}"`); // Eg: metadata["doc_id"] == "./data/movie_reviews.csv_95" + break; + } + case "in": { + if (!Array.isArray(filter.value)) { + throw new Error("Operator 'in' requires array value"); + } + const filterValue = filter.value.map((v) => `"${v}"`).join(", "); + filters.push(`metadata["${filter.key}"] in [${filterValue}]`); // Eg: metadata["doc_id"] in ["./data/movie_reviews.csv_95"] + break; + } + // TODO: Add support for other operators + default: + throw new Error(`Operator ${filter.operator} is not supported.`); + } + } + + return filters.join(` ${condition} `); +} + export class MilvusVectorStore extends VectorStoreBase implements VectorStoreNoEmbedModel @@ -183,6 +215,12 @@ export class MilvusVectorStore }); } + private toMilvusFilter(filters?: MetadataFilters): string | undefined { + if (!filters) return undefined; + // TODO: Milvus also support standard filters, we can add it later + return parseScalarFilters(filters); + } + public async query( query: VectorStoreQuery, _options?: any, @@ -193,6 +231,7 @@ export class MilvusVectorStore collection_name: this.collectionName, limit: query.similarityTopK, vector: query.queryEmbedding, + filter: this.toMilvusFilter(query.filters), }); const nodes: BaseNode[] = []; From af2865765904239e224fee4617cc076964909464 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 12 Jul 2024 07:23:47 +0000 Subject: [PATCH 06/17] refactor: extract build filter function --- .../storage/vectorStore/SimpleVectorStore.ts | 97 +++++++++++-------- .../src/storage/vectorStore/types.ts | 20 ++-- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts index b023a8a777..b3b0997930 100644 --- a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts @@ -14,6 +14,8 @@ import { VectorStoreQueryMode, type IEmbedModel, type MetadataFilter, + type MetadataFilters, + type MetadataFilterValue, type VectorStoreNoEmbedModel, type VectorStoreQuery, type VectorStoreQueryResult, @@ -30,6 +32,58 @@ const MMR_MODE = VectorStoreQueryMode.MMR; type MetadataValue = Record; +const parsePrimitiveValue = (value: MetadataFilterValue): string | number => { + if (Array.isArray(value)) throw new Error("Value must be a string or number"); + return value.toString(); +}; + +const parseArrayValue = (value: MetadataFilterValue): string[] | number[] => { + if (!Array.isArray(value)) + throw new Error("Value must be an array of strings or numbers"); + return value.map(String); +}; + +const OPERATOR_TO_FILTER_FN: { + [key in FilterOperator]?: ( + input: MetadataFilter & { + metadata: MetadataValue; + }, + ) => boolean; +} = { + [FilterOperator.EQ]: function (input): boolean { + return ( + String(input.metadata[input.key]) === parsePrimitiveValue(input.value) + ); + }, + [FilterOperator.IN]: function (input): boolean { + return !!parseArrayValue(input.value).find( + (i) => i === String(input.metadata[input.key]), + ); + }, +}; + +// Build a filter function based on the metadata and the preFilters +const buildFilterFn = ( + metadata: MetadataValue | undefined, + preFilters: MetadataFilters | undefined, +) => { + if (!preFilters) return true; + if (!metadata) return false; + + const { filters, condition } = preFilters; + const queryCondition = condition || "and"; // default to and + + const itemFilterFn = (filter: MetadataFilter) => { + const { key, value, operator } = filter; + const metadataLookupFn = OPERATOR_TO_FILTER_FN[operator]; + if (!metadataLookupFn) throw new Error(`Unsupported operator: ${operator}`); + return metadataLookupFn({ metadata, key, value, operator }); + }; + + if (queryCondition === "and") return filters.every(itemFilterFn); + return filters.some(itemFilterFn); +}; + class SimpleVectorStoreData { embeddingDict: Record = {}; textIdToRefDocId: Record = {}; @@ -108,48 +162,9 @@ export class SimpleVectorStore embeddings: number[][]; }> { const items = Object.entries(this.data.embeddingDict); - - const operatorToFilterFn: Record< - FilterOperator, - ( - input: Omit & { - metadata: MetadataValue; - }, - ) => boolean - > = { - [FilterOperator.EQ]: function (input): boolean { - return String(input.metadata[input.key]) === input.value.toString(); // compare as string - }, - [FilterOperator.IN]: function (input): boolean { - if (!Array.isArray(input.value)) { - throw new Error( - "To use IN, value must be an array of strings or numbers", - ); - } - return input.value - .map(String) - .includes(String(input.metadata[input.key])); - }, - }; - const queryFilterFn = (nodeId: string) => { - if (!query.filters) return true; - const { filters, condition } = query.filters; - const queryCondition = condition || "and"; // default to and - - const queryFilterItemFn = (filter: MetadataFilter) => { - const { key, value, operator } = filter; - const metadataLookupFn = operatorToFilterFn[operator]; - const metadata = this.data.metadataDict[nodeId]; - return ( - metadataLookupFn && - metadata && - metadataLookupFn({ metadata, key, value }) - ); - }; - - if (queryCondition === "and") return filters.every(queryFilterItemFn); - return filters.some(queryFilterItemFn); + const metadata = this.data.metadataDict[nodeId]; + return buildFilterFn(metadata, query.filters); }; const nodeFilterFn = (nodeId: string) => { diff --git a/packages/llamaindex/src/storage/vectorStore/types.ts b/packages/llamaindex/src/storage/vectorStore/types.ts index 5e636f7bd0..8f990e51d0 100644 --- a/packages/llamaindex/src/storage/vectorStore/types.ts +++ b/packages/llamaindex/src/storage/vectorStore/types.ts @@ -32,16 +32,16 @@ export interface ExactMatchFilter { export enum FilterOperator { EQ = "==", // default operator (string, number) IN = "in", // In array (string or number) - // GT = ">", // greater than (number) - // LT = "<", // less than (number) - // NE = "!=", // not equal to (string, number) - // GTE = ">=", // greater than or equal to (number) - // LTE = "<=", // less than or equal to (number) - // NIN = "nin", // Not in array (string or number) - // ANY = "any", // Contains any (array of strings) - // ALL = "all", // Contains all (array of strings) - // TEXT_MATCH = "text_match", // full text match (allows you to search for a specific substring, token or phrase within the text field) - // CONTAINS = "contains", // metadata array contains value (string or number) + GT = ">", // greater than (number) + LT = "<", // less than (number) + NE = "!=", // not equal to (string, number) + GTE = ">=", // greater than or equal to (number) + LTE = "<=", // less than or equal to (number) + NIN = "nin", // Not in array (string or number) + ANY = "any", // Contains any (array of strings) + ALL = "all", // Contains all (array of strings) + TEXT_MATCH = "text_match", // full text match (allows you to search for a specific substring, token or phrase within the text field) + CONTAINS = "contains", // metadata array contains value (string or number) } export enum FilterCondition { From edd9245935b06ae52140cf869d2a07c303888c53 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 12 Jul 2024 07:48:43 +0000 Subject: [PATCH 07/17] feat: support all operators for SimpleVectorStore --- .../storage/vectorStore/SimpleVectorStore.ts | 81 ++++++++++++++----- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts index b3b0997930..95e5265625 100644 --- a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts @@ -32,34 +32,77 @@ const MMR_MODE = VectorStoreQueryMode.MMR; type MetadataValue = Record; -const parsePrimitiveValue = (value: MetadataFilterValue): string | number => { - if (Array.isArray(value)) throw new Error("Value must be a string or number"); +const parseNumberValue = (value: MetadataFilterValue): number => { + if (typeof value !== "number") throw new Error("Value must be a number"); + return value; +}; + +const parsePrimitiveValue = (value: MetadataFilterValue): string => { + if (typeof value !== "number" && typeof value !== "string") { + throw new Error("Value must be a string or number"); + } return value.toString(); }; -const parseArrayValue = (value: MetadataFilterValue): string[] | number[] => { - if (!Array.isArray(value)) +const parseArrayValue = (value: MetadataFilterValue): string[] => { + const isPrimitiveArray = + Array.isArray(value) && + value.every((v) => typeof v === "string" || typeof v === "number"); + if (!isPrimitiveArray) { throw new Error("Value must be an array of strings or numbers"); + } return value.map(String); }; -const OPERATOR_TO_FILTER_FN: { - [key in FilterOperator]?: ( - input: MetadataFilter & { - metadata: MetadataValue; - }, +// Mapping of filter operators to metadata filter functions +const OPERATOR_TO_FILTER: { + [key in FilterOperator]: ( + { key, value }: MetadataFilter, + metadata: MetadataValue, ) => boolean; } = { - [FilterOperator.EQ]: function (input): boolean { - return ( - String(input.metadata[input.key]) === parsePrimitiveValue(input.value) + [FilterOperator.EQ]: ({ key, value }, metadata) => { + return parsePrimitiveValue(metadata[key]) === parsePrimitiveValue(value); + }, + [FilterOperator.NE]: ({ key, value }, metadata) => { + return parsePrimitiveValue(metadata[key]) !== parsePrimitiveValue(value); + }, + [FilterOperator.IN]: ({ key, value }, metadata) => { + return parseArrayValue(value).includes(parsePrimitiveValue(metadata[key])); + }, + [FilterOperator.NIN]: ({ key, value }, metadata) => { + return !parseArrayValue(value).includes(parsePrimitiveValue(metadata[key])); + }, + [FilterOperator.ANY]: ({ key, value }, metadata) => { + return parseArrayValue(value).some((v) => + parseArrayValue(metadata[key]).includes(v), ); }, - [FilterOperator.IN]: function (input): boolean { - return !!parseArrayValue(input.value).find( - (i) => i === String(input.metadata[input.key]), + [FilterOperator.ALL]: ({ key, value }, metadata) => { + return parseArrayValue(value).every((v) => + parseArrayValue(metadata[key]).includes(v), ); }, + [FilterOperator.TEXT_MATCH]: ({ key, value }, metadata) => { + return parsePrimitiveValue(metadata[key]).includes( + parsePrimitiveValue(value), + ); + }, + [FilterOperator.CONTAINS]: ({ key, value }, metadata) => { + return parseArrayValue(metadata[key]).includes(parsePrimitiveValue(value)); + }, + [FilterOperator.GT]: ({ key, value }, metadata) => { + return parseNumberValue(metadata[key]) > parseNumberValue(value); + }, + [FilterOperator.LT]: ({ key, value }, metadata) => { + return parseNumberValue(metadata[key]) < parseNumberValue(value); + }, + [FilterOperator.GTE]: ({ key, value }, metadata) => { + return parseNumberValue(metadata[key]) >= parseNumberValue(value); + }, + [FilterOperator.LTE]: ({ key, value }, metadata) => { + return parseNumberValue(metadata[key]) <= parseNumberValue(value); + }, }; // Build a filter function based on the metadata and the preFilters @@ -74,10 +117,10 @@ const buildFilterFn = ( const queryCondition = condition || "and"; // default to and const itemFilterFn = (filter: MetadataFilter) => { - const { key, value, operator } = filter; - const metadataLookupFn = OPERATOR_TO_FILTER_FN[operator]; - if (!metadataLookupFn) throw new Error(`Unsupported operator: ${operator}`); - return metadataLookupFn({ metadata, key, value, operator }); + const metadataLookupFn = OPERATOR_TO_FILTER[filter.operator]; + if (!metadataLookupFn) + throw new Error(`Unsupported operator: ${filter.operator}`); + return metadataLookupFn(filter, metadata); }; if (queryCondition === "and") return filters.every(itemFilterFn); From d83144c68cc4bf59ab45a0319b729340b2629bde Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 12 Jul 2024 08:50:43 +0000 Subject: [PATCH 08/17] only compare string or number --- .../tests/vectorStores/SimpleVectorStore.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index 0338987c16..13fb958abe 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -19,19 +19,19 @@ describe("SimpleVectorStore", () => { id_: "1", embedding: [0.1, 0.2], text: "The dog is brown", - metadata: { dogId: "1", private: true }, + metadata: { dogId: "1", private: "true" }, }), new TextNode({ id_: "2", embedding: [0.2, 0.3], text: "The dog is yellow", - metadata: { dogId: "2", private: false }, + metadata: { dogId: "2", private: "false" }, }), new TextNode({ id_: "3", embedding: [0.3, 0.1], text: "The dog is red", - metadata: { dogId: "3", private: false }, + metadata: { dogId: "3", private: "false" }, }), ]; store = new SimpleVectorStore({ @@ -41,9 +41,9 @@ describe("SimpleVectorStore", () => { textIdToRefDocId: {}, metadataDict: { // Mocking the metadataDict - "1": { dogId: "1", private: true }, - "2": { dogId: "2", private: false }, - "3": { dogId: "3", private: false }, + "1": { dogId: "1", private: "true" }, + "2": { dogId: "2", private: "false" }, + "3": { dogId: "3", private: "false" }, }, }, }); From bf26460ebbcd52fc827e7470bd86164794e43a7f Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:42:30 +0000 Subject: [PATCH 09/17] feat: add operators to Milvus --- .../storage/vectorStore/MilvusVectorStore.ts | 43 +++++++++++++------ .../storage/vectorStore/SimpleVectorStore.ts | 30 +++---------- .../src/storage/vectorStore/utils.ts | 23 ++++++++++ 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts index c96d0bc142..b8d02d927e 100644 --- a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts @@ -16,7 +16,13 @@ import { type VectorStoreQuery, type VectorStoreQueryResult, } from "./types.js"; -import { metadataDictToNode, nodeToMetadata } from "./utils.js"; +import { + metadataDictToNode, + nodeToMetadata, + parseArrayValue, + parseNumberValue, + parsePrimitiveValue, +} from "./utils.js"; function parseScalarFilters(scalarFilters: MetadataFilters): string { const condition = scalarFilters.condition ?? "and"; @@ -24,23 +30,32 @@ function parseScalarFilters(scalarFilters: MetadataFilters): string { for (const filter of scalarFilters.filters) { switch (filter.operator) { - case "==": { - if (Array.isArray(filter.value)) { - throw new Error("Operator '==' does not support array value"); - } - const filterValue = String(filter.value); - filters.push(`metadata["${filter.key}"] == "${filterValue}"`); // Eg: metadata["doc_id"] == "./data/movie_reviews.csv_95" + case "==": + case "!=": { + filters.push( + `metadata["${filter.key}"] ${filter.operator} "${parsePrimitiveValue(filter.value)}"`, + ); + break; + } + case "in": + case "nin": { + const filterValue = parseArrayValue(filter.value) + .map((v) => `"${v}"`) + .join(", "); + filters.push( + `metadata["${filter.key}"] ${filter.operator} [${filterValue}]`, + ); break; } - case "in": { - if (!Array.isArray(filter.value)) { - throw new Error("Operator 'in' requires array value"); - } - const filterValue = filter.value.map((v) => `"${v}"`).join(", "); - filters.push(`metadata["${filter.key}"] in [${filterValue}]`); // Eg: metadata["doc_id"] in ["./data/movie_reviews.csv_95"] + case "<": + case "<=": + case ">": + case ">=": { + filters.push( + `metadata["${filter.key}"] ${filter.operator} ${parseNumberValue(filter.value)}`, + ); break; } - // TODO: Add support for other operators default: throw new Error(`Operator ${filter.operator} is not supported.`); } diff --git a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts index 95e5265625..175f56787e 100644 --- a/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/SimpleVectorStore.ts @@ -15,12 +15,16 @@ import { type IEmbedModel, type MetadataFilter, type MetadataFilters, - type MetadataFilterValue, type VectorStoreNoEmbedModel, type VectorStoreQuery, type VectorStoreQueryResult, } from "./types.js"; -import { nodeToMetadata } from "./utils.js"; +import { + nodeToMetadata, + parseArrayValue, + parseNumberValue, + parsePrimitiveValue, +} from "./utils.js"; const LEARNER_MODES = new Set([ VectorStoreQueryMode.SVM, @@ -32,28 +36,6 @@ const MMR_MODE = VectorStoreQueryMode.MMR; type MetadataValue = Record; -const parseNumberValue = (value: MetadataFilterValue): number => { - if (typeof value !== "number") throw new Error("Value must be a number"); - return value; -}; - -const parsePrimitiveValue = (value: MetadataFilterValue): string => { - if (typeof value !== "number" && typeof value !== "string") { - throw new Error("Value must be a string or number"); - } - return value.toString(); -}; - -const parseArrayValue = (value: MetadataFilterValue): string[] => { - const isPrimitiveArray = - Array.isArray(value) && - value.every((v) => typeof v === "string" || typeof v === "number"); - if (!isPrimitiveArray) { - throw new Error("Value must be an array of strings or numbers"); - } - return value.map(String); -}; - // Mapping of filter operators to metadata filter functions const OPERATOR_TO_FILTER: { [key in FilterOperator]: ( diff --git a/packages/llamaindex/src/storage/vectorStore/utils.ts b/packages/llamaindex/src/storage/vectorStore/utils.ts index 1a00dee274..8bdc394744 100644 --- a/packages/llamaindex/src/storage/vectorStore/utils.ts +++ b/packages/llamaindex/src/storage/vectorStore/utils.ts @@ -1,5 +1,6 @@ import type { BaseNode, Metadata } from "@llamaindex/core/schema"; import { ObjectType, jsonToNode } from "@llamaindex/core/schema"; +import type { MetadataFilterValue } from "./types.js"; const DEFAULT_TEXT_KEY = "text"; @@ -77,3 +78,25 @@ export function metadataDictToNode( return jsonToNode(nodeObj, ObjectType.TEXT); } } + +export const parseNumberValue = (value: MetadataFilterValue): number => { + if (typeof value !== "number") throw new Error("Value must be a number"); + return value; +}; + +export const parsePrimitiveValue = (value: MetadataFilterValue): string => { + if (typeof value !== "number" && typeof value !== "string") { + throw new Error("Value must be a string or number"); + } + return value.toString(); +}; + +export const parseArrayValue = (value: MetadataFilterValue): string[] => { + const isPrimitiveArray = + Array.isArray(value) && + value.every((v) => typeof v === "string" || typeof v === "number"); + if (!isPrimitiveArray) { + throw new Error("Value must be an array of strings or numbers"); + } + return value.map(String); +}; From f9a175b00b2275f3aeeca3da5bcc5944c8208643 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 09:10:40 +0000 Subject: [PATCH 10/17] feat: loop test cases --- .../vectorStores/SimpleVectorStore.test.ts | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index 13fb958abe..4977ff9146 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -4,11 +4,18 @@ import { SimpleVectorStore, TextNode, VectorStoreQueryMode, + type MetadataFilters, } from "llamaindex"; import { beforeEach, describe, expect, it, vi } from "vitest"; vi.mock("@qdrant/js-client-rest"); +type FilterTestCase = { + title: string; + filters?: MetadataFilters; + expected: number; +}; + describe("SimpleVectorStore", () => { let nodes: BaseNode[]; let store: SimpleVectorStore; @@ -49,26 +56,21 @@ describe("SimpleVectorStore", () => { }); }); - describe("[SimpleVectorStore]", () => { + describe("[SimpleVectorStore] manage nodes", () => { it("able to add nodes to store", async () => { const ids = await store.add(nodes); expect(ids).length(3); }); - it("able to query nodes without filter", async () => { - await store.add(nodes); - const result = await store.query({ - queryEmbedding: [0.1, 0.2], - similarityTopK: 3, - mode: VectorStoreQueryMode.DEFAULT, - }); - expect(result.similarities).length(3); - }); - it("able to query nodes with filter EQ", async () => { - await store.add(nodes); - const result = await store.query({ - queryEmbedding: [0.1, 0.2], - similarityTopK: 3, - mode: VectorStoreQueryMode.DEFAULT, + }); + + describe("[SimpleVectorStore] query nodes", () => { + const testcases: FilterTestCase[] = [ + { + title: "No filter", + expected: 3, + }, + { + title: "Filter EQ", filters: { filters: [ { @@ -78,15 +80,10 @@ describe("SimpleVectorStore", () => { }, ], }, - }); - expect(result.similarities).length(2); - }); - it("able to query nodes with filter IN", async () => { - await store.add(nodes); - const result = await store.query({ - queryEmbedding: [0.1, 0.2], - similarityTopK: 3, - mode: VectorStoreQueryMode.DEFAULT, + expected: 2, + }, + { + title: "Filter IN", filters: { filters: [ { @@ -96,15 +93,10 @@ describe("SimpleVectorStore", () => { }, ], }, - }); - expect(result.similarities).length(2); - }); - it("able to query nodes with filter condition OR", async () => { - await store.add(nodes); - const result = await store.query({ - queryEmbedding: [0.1, 0.2], - similarityTopK: 3, - mode: VectorStoreQueryMode.DEFAULT, + expected: 2, + }, + { + title: "Filter OR", filters: { filters: [ { @@ -120,8 +112,21 @@ describe("SimpleVectorStore", () => { ], condition: "or", }, + expected: 3, + }, + ]; + + testcases.forEach((tc) => { + it(`[${tc.title}] should return ${tc.expected} nodes`, async () => { + await store.add(nodes); + const result = await store.query({ + queryEmbedding: [0.1, 0.2], + similarityTopK: 3, + mode: VectorStoreQueryMode.DEFAULT, + filters: tc.filters, + }); + expect(result.ids).length(tc.expected); }); - expect(result.similarities).length(3); }); }); }); From 6ecc05b44f528ec3444df104e6af4bd522322bca Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:06:32 +0000 Subject: [PATCH 11/17] test: add tests for all operators --- .../vectorStores/SimpleVectorStore.test.ts | 191 +++++++++++++++++- 1 file changed, 180 insertions(+), 11 deletions(-) diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index 4977ff9146..6ee5fc48c2 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -4,6 +4,7 @@ import { SimpleVectorStore, TextNode, VectorStoreQueryMode, + type Metadata, type MetadataFilters, } from "llamaindex"; import { beforeEach, describe, expect, it, vi } from "vitest"; @@ -26,19 +27,37 @@ describe("SimpleVectorStore", () => { id_: "1", embedding: [0.1, 0.2], text: "The dog is brown", - metadata: { dogId: "1", private: "true" }, + metadata: { + name: "Anakin", + dogId: "1", + private: "true", + weight: 1.2, + type: ["husky", "puppy"], + }, }), new TextNode({ id_: "2", - embedding: [0.2, 0.3], + embedding: [0.1, 0.2], text: "The dog is yellow", - metadata: { dogId: "2", private: "false" }, + metadata: { + name: "Luke", + dogId: "2", + private: "false", + weight: 2.3, + type: ["puppy"], + }, }), new TextNode({ id_: "3", - embedding: [0.3, 0.1], + embedding: [0.1, 0.2], text: "The dog is red", - metadata: { dogId: "3", private: "false" }, + metadata: { + name: "Leia", + dogId: "3", + private: "false", + weight: 3.4, + type: ["husky"], + }, }), ]; store = new SimpleVectorStore({ @@ -46,12 +65,13 @@ describe("SimpleVectorStore", () => { data: { embeddingDict: {}, textIdToRefDocId: {}, - metadataDict: { - // Mocking the metadataDict - "1": { dogId: "1", private: "true" }, - "2": { dogId: "2", private: "false" }, - "3": { dogId: "3", private: "false" }, - }, + metadataDict: nodes.reduce( + (acc, node) => { + acc[node.id_] = node.metadata; + return acc; + }, + {} as Record, + ), }, }); }); @@ -82,6 +102,71 @@ describe("SimpleVectorStore", () => { }, expected: 2, }, + { + title: "Filter NE", + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "!=", + }, + ], + }, + expected: 1, + }, + { + title: "Filter GT", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: ">", + }, + ], + }, + expected: 1, + }, + { + title: "Filter GTE", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: ">=", + }, + ], + }, + expected: 2, + }, + { + title: "Filter LT", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: "<", + }, + ], + }, + expected: 1, + }, + { + title: "Filter LTE", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: "<=", + }, + ], + }, + expected: 2, + }, { title: "Filter IN", filters: { @@ -95,6 +180,71 @@ describe("SimpleVectorStore", () => { }, expected: 2, }, + { + title: "Filter NIN", + filters: { + filters: [ + { + key: "name", + value: ["Anakin", "Leia"], + operator: "nin", + }, + ], + }, + expected: 1, + }, + { + title: "Filter ANY", + filters: { + filters: [ + { + key: "type", + value: ["husky", "puppy"], + operator: "any", + }, + ], + }, + expected: 3, + }, + { + title: "Filter ALL", + filters: { + filters: [ + { + key: "type", + value: ["husky", "puppy"], + operator: "all", + }, + ], + }, + expected: 1, + }, + { + title: "Filter CONTAINS", + filters: { + filters: [ + { + key: "type", + value: "puppy", + operator: "contains", + }, + ], + }, + expected: 2, + }, + { + title: "Filter TEXT_MATCH", + filters: { + filters: [ + { + key: "name", + value: "Luk", + operator: "text_match", + }, + ], + }, + expected: 1, + }, { title: "Filter OR", filters: { @@ -114,6 +264,25 @@ describe("SimpleVectorStore", () => { }, expected: 3, }, + { + title: "Filter OR", + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "==", + }, + { + key: "dogId", + value: "10", + operator: "==", + }, + ], + condition: "and", + }, + expected: 0, + }, ]; testcases.forEach((tc) => { From 7898323935216e70277ce4579aeb9946c8b2ce28 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:30:37 +0000 Subject: [PATCH 12/17] fix: typo --- examples/milvus/query.ts | 44 ++++--------------- .../vectorStores/SimpleVectorStore.test.ts | 2 +- 2 files changed, 10 insertions(+), 36 deletions(-) diff --git a/examples/milvus/query.ts b/examples/milvus/query.ts index d4f308f514..ff33fc69a7 100644 --- a/examples/milvus/query.ts +++ b/examples/milvus/query.ts @@ -5,44 +5,18 @@ const collectionName = "movie_reviews"; async function main() { try { const milvus = new MilvusVectorStore({ collection: collectionName }); + const index = await VectorStoreIndex.fromVectorStore(milvus); - const retriever = index.asRetriever({ similarityTopK: 20 }); - console.log("=====\nQuerying the index without any filters."); - const queryEngineNoFilters = index.asQueryEngine({ retriever }); - const resultNoFilter = await queryEngineNoFilters.query({ - query: "Summary movie reviews", - }); - console.log(`Query from ${resultNoFilter.sourceNodes?.length} nodes`); - console.log(resultNoFilter.response); - - console.log("\n=====\nQuerying the index with filters"); - const queryEngineWithFilters = index.asQueryEngine({ - retriever, - preFilters: { - filters: [ - { - key: "doc_id", - value: [ - "./data/movie_reviews.csv_95", - "./data/movie_reviews.csv_101", - ], - operator: "in", - }, - { - key: "document_id", - value: "./data/movie_reviews.csv_37", - operator: "==", - }, - ], - condition: "or", - }, - }); - const resultAfterFilter = await queryEngineWithFilters.query({ - query: "Summary movie reviews", + const retriever = await index.asRetriever({ similarityTopK: 20 }); + + const queryEngine = await index.asQueryEngine({ retriever }); + + const results = await queryEngine.query({ + query: "What is the best reviewed movie?", }); - console.log(`Query from ${resultAfterFilter.sourceNodes?.length} nodes`); - console.log(resultAfterFilter.response); + + console.log(results.response); } catch (e) { console.error(e); } diff --git a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts index 6ee5fc48c2..11423f0382 100644 --- a/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/SimpleVectorStore.test.ts @@ -265,7 +265,7 @@ describe("SimpleVectorStore", () => { expected: 3, }, { - title: "Filter OR", + title: "Filter AND", filters: { filters: [ { From a75df30cd1cfc59160cc4a783097ed8c28e4b44e Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:44:13 +0000 Subject: [PATCH 13/17] fix: nin not support and need to replace by joining != --- .../src/storage/vectorStore/MilvusVectorStore.ts | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts index b8d02d927e..92319c81f5 100644 --- a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts @@ -37,8 +37,7 @@ function parseScalarFilters(scalarFilters: MetadataFilters): string { ); break; } - case "in": - case "nin": { + case "in": { const filterValue = parseArrayValue(filter.value) .map((v) => `"${v}"`) .join(", "); @@ -47,6 +46,15 @@ function parseScalarFilters(scalarFilters: MetadataFilters): string { ); break; } + case "nin": { + // Milmus does not support `nin` operator, so we need to manually check every value + // Expected: not metadata["key"] != "value1" and not metadata["key"] != "value2" + const filterStr = parseArrayValue(filter.value) + .map((v) => `metadata["${filter.key}"] != "${v}"`) + .join(" && "); + filters.push(filterStr); + break; + } case "<": case "<=": case ">": @@ -61,6 +69,8 @@ function parseScalarFilters(scalarFilters: MetadataFilters): string { } } + console.log({ filterStr: filters.join(` ${condition} `) }); + return filters.join(` ${condition} `); } From 31bf7de9af4be5a034fafabcc9bf5cc42aa48787 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:44:27 +0000 Subject: [PATCH 14/17] test: update example for filtering milvus --- examples/metadata-filter/milvus.ts | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 examples/metadata-filter/milvus.ts diff --git a/examples/metadata-filter/milvus.ts b/examples/metadata-filter/milvus.ts new file mode 100644 index 0000000000..9415bca57f --- /dev/null +++ b/examples/metadata-filter/milvus.ts @@ -0,0 +1,40 @@ +import { MilvusVectorStore, VectorStoreIndex } from "llamaindex"; + +const collectionName = "movie_reviews"; + +async function main() { + try { + const milvus = new MilvusVectorStore({ collection: collectionName }); + const index = await VectorStoreIndex.fromVectorStore(milvus); + const retriever = index.asRetriever({ similarityTopK: 20 }); + + console.log("\n=====\nQuerying the index with filters"); + const queryEngineWithFilters = index.asQueryEngine({ + retriever, + preFilters: { + filters: [ + { + key: "document_id", + value: "./data/movie_reviews.csv_37", + operator: "==", + }, + { + key: "document_id", + value: "./data/movie_reviews.csv_37", + operator: "!=", + }, + ], + condition: "or", + }, + }); + const resultAfterFilter = await queryEngineWithFilters.query({ + query: "Get all movie titles.", + }); + console.log(`Query from ${resultAfterFilter.sourceNodes?.length} nodes`); + console.log(resultAfterFilter.response); + } catch (e) { + console.error(e); + } +} + +void main(); From 7fbf75260c842f8e916ba29b2c98b5c149131419 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:51:52 +0000 Subject: [PATCH 15/17] feat: test filter milvus and mock test query --- .../storage/vectorStore/MilvusVectorStore.ts | 4 +- .../vectorStores/MilvusVectorStore.test.ts | 355 ++++++++++++++++++ 2 files changed, 356 insertions(+), 3 deletions(-) create mode 100644 packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts diff --git a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts index 92319c81f5..9cfd5d9660 100644 --- a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts @@ -69,8 +69,6 @@ function parseScalarFilters(scalarFilters: MetadataFilters): string { } } - console.log({ filterStr: filters.join(` ${condition} `) }); - return filters.join(` ${condition} `); } @@ -240,7 +238,7 @@ export class MilvusVectorStore }); } - private toMilvusFilter(filters?: MetadataFilters): string | undefined { + public toMilvusFilter(filters?: MetadataFilters): string | undefined { if (!filters) return undefined; // TODO: Milvus also support standard filters, we can add it later return parseScalarFilters(filters); diff --git a/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts new file mode 100644 index 0000000000..2c70fdc76a --- /dev/null +++ b/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts @@ -0,0 +1,355 @@ +import type { BaseNode } from "@llamaindex/core/schema"; +import { TextNode } from "@llamaindex/core/schema"; +import type { MilvusClient } from "@zilliz/milvus2-sdk-node"; +import { + MilvusVectorStore, + VectorStoreQueryMode, + type MetadataFilters, +} from "llamaindex"; +import { beforeEach, describe, expect, it, vi, type Mocked } from "vitest"; + +vi.mock("@qdrant/js-client-rest"); + +type FilterTestCase = { + title: string; + filters?: MetadataFilters; + expected: number; + expectedFilterStr: string | undefined; + mockResultIds: string[]; +}; + +export class TestableMilvusVectorStore extends MilvusVectorStore { + public nodes: BaseNode[] = []; + + private fakeTimeout = (ms: number) => { + return new Promise((resolve) => setTimeout(resolve, ms)); + }; + + public async add(nodes: BaseNode[]): Promise { + this.nodes.push(...nodes); + await this.fakeTimeout(100); + return nodes.map((node) => node.id_); + } + + constructor() { + super({ + milvusClient: {} as Mocked, + }); + } +} + +describe("MilvusVectorStore", () => { + let store: MilvusVectorStore; + let nodes: BaseNode[]; + + beforeEach(() => { + store = new TestableMilvusVectorStore(); + nodes = [ + new TextNode({ + id_: "1", + embedding: [0.1, 0.2], + text: "The dog is brown", + metadata: { + name: "Anakin", + dogId: "1", + private: "true", + weight: 1.2, + type: ["husky", "puppy"], + }, + }), + new TextNode({ + id_: "2", + embedding: [0.1, 0.2], + text: "The dog is yellow", + metadata: { + name: "Luke", + dogId: "2", + private: "false", + weight: 2.3, + type: ["puppy"], + }, + }), + new TextNode({ + id_: "3", + embedding: [0.1, 0.2], + text: "The dog is red", + metadata: { + name: "Leia", + dogId: "3", + private: "false", + weight: 3.4, + type: ["husky"], + }, + }), + ]; + }); + + describe("[MilvusVectorStore] manage nodes", () => { + it("able to add nodes to store", async () => { + const ids = await store.add(nodes); + expect(ids).length(3); + }); + }); + + describe("[MilvusVectorStore] filter nodes with supported operators", () => { + const testcases: FilterTestCase[] = [ + { + title: "No filter", + expected: 3, + mockResultIds: ["1", "2", "3"], + expectedFilterStr: undefined, + }, + { + title: "Filter EQ", + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "==", + }, + ], + }, + expected: 2, + mockResultIds: ["2", "3"], + expectedFilterStr: 'metadata["private"] == "false"', + }, + { + title: "Filter NE", + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "!=", + }, + ], + }, + expected: 1, + mockResultIds: ["1"], + expectedFilterStr: 'metadata["private"] != "false"', + }, + { + title: "Filter GT", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: ">", + }, + ], + }, + expected: 1, + mockResultIds: ["3"], + expectedFilterStr: 'metadata["weight"] > 2.3', + }, + { + title: "Filter GTE", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: ">=", + }, + ], + }, + expected: 2, + mockResultIds: ["2", "3"], + expectedFilterStr: 'metadata["weight"] >= 2.3', + }, + { + title: "Filter LT", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: "<", + }, + ], + }, + expected: 1, + mockResultIds: ["1"], + expectedFilterStr: 'metadata["weight"] < 2.3', + }, + { + title: "Filter LTE", + filters: { + filters: [ + { + key: "weight", + value: 2.3, + operator: "<=", + }, + ], + }, + expected: 2, + mockResultIds: ["1", "2"], + expectedFilterStr: 'metadata["weight"] <= 2.3', + }, + { + title: "Filter IN", + filters: { + filters: [ + { + key: "dogId", + value: ["1", "3"], + operator: "in", + }, + ], + }, + expected: 2, + mockResultIds: ["1", "3"], + expectedFilterStr: 'metadata["dogId"] in ["1", "3"]', + }, + { + title: "Filter NIN", + filters: { + filters: [ + { + key: "name", + value: ["Anakin", "Leia"], + operator: "nin", + }, + ], + }, + expected: 1, + mockResultIds: ["2"], + expectedFilterStr: + 'metadata["name"] != "Anakin" && metadata["name"] != "Leia"', + }, + { + title: "Filter OR", + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "==", + }, + { + key: "dogId", + value: ["1", "3"], + operator: "in", + }, + ], + condition: "or", + }, + expected: 3, + mockResultIds: ["1", "2", "3"], + expectedFilterStr: + 'metadata["private"] == "false" or metadata["dogId"] in ["1", "3"]', + }, + { + title: "Filter AND", + filters: { + filters: [ + { + key: "private", + value: "false", + operator: "==", + }, + { + key: "dogId", + value: "10", + operator: "==", + }, + ], + condition: "and", + }, + expected: 0, + mockResultIds: [], + expectedFilterStr: + 'metadata["private"] == "false" and metadata["dogId"] == "10"', + }, + ]; + + testcases.forEach((tc) => { + it(`[${tc.title}] should return ${tc.expected} nodes`, async () => { + expect(store.toMilvusFilter(tc.filters)).toBe(tc.expectedFilterStr); + + vi.spyOn(store, "query").mockResolvedValue({ + ids: tc.mockResultIds, + similarities: [0.1, 0.2, 0.3], + }); + + await store.add(nodes); + const result = await store.query({ + queryEmbedding: [0.1, 0.2], + similarityTopK: 3, + mode: VectorStoreQueryMode.DEFAULT, + filters: tc.filters, + }); + expect(result.ids).length(tc.expected); + }); + }); + }); + + describe("[MilvusVectorStore] filter nodes with unsupported operators", () => { + const testcases: Array< + Omit + > = [ + { + title: "Filter ANY", + filters: { + filters: [ + { + key: "type", + value: ["husky", "puppy"], + operator: "any", + }, + ], + }, + expected: 3, + }, + { + title: "Filter ALL", + filters: { + filters: [ + { + key: "type", + value: ["husky", "puppy"], + operator: "all", + }, + ], + }, + expected: 1, + }, + { + title: "Filter CONTAINS", + filters: { + filters: [ + { + key: "type", + value: "puppy", + operator: "contains", + }, + ], + }, + expected: 2, + }, + { + title: "Filter TEXT_MATCH", + filters: { + filters: [ + { + key: "name", + value: "Luk", + operator: "text_match", + }, + ], + }, + expected: 1, + }, + ]; + + testcases.forEach((tc) => { + it(`[Unsupported Operator] [${tc.title}] should throw error`, async () => { + const errorMsg = `Operator ${tc.filters?.filters[0].operator} is not supported.`; + expect(() => store.toMilvusFilter(tc.filters)).toThrow(errorMsg); + }); + }); + }); +}); From 498d3bb94db6d92cd7518c12a3eaf500daa7dc48 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:57:19 +0000 Subject: [PATCH 16/17] refactor: move testable class to mock --- .../tests/mocks/TestableMilvusVectorStore.ts | 24 +++++++++++++++++ .../vectorStores/MilvusVectorStore.test.ts | 26 ++----------------- 2 files changed, 26 insertions(+), 24 deletions(-) create mode 100644 packages/llamaindex/tests/mocks/TestableMilvusVectorStore.ts diff --git a/packages/llamaindex/tests/mocks/TestableMilvusVectorStore.ts b/packages/llamaindex/tests/mocks/TestableMilvusVectorStore.ts new file mode 100644 index 0000000000..f5665c7af9 --- /dev/null +++ b/packages/llamaindex/tests/mocks/TestableMilvusVectorStore.ts @@ -0,0 +1,24 @@ +import type { BaseNode } from "@llamaindex/core/schema"; +import type { MilvusClient } from "@zilliz/milvus2-sdk-node"; +import { MilvusVectorStore } from "llamaindex"; +import { type Mocked } from "vitest"; + +export class TestableMilvusVectorStore extends MilvusVectorStore { + public nodes: BaseNode[] = []; + + private fakeTimeout = (ms: number) => { + return new Promise((resolve) => setTimeout(resolve, ms)); + }; + + public async add(nodes: BaseNode[]): Promise { + this.nodes.push(...nodes); + await this.fakeTimeout(100); + return nodes.map((node) => node.id_); + } + + constructor() { + super({ + milvusClient: {} as Mocked, + }); + } +} diff --git a/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts b/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts index 2c70fdc76a..7c2c5e50a9 100644 --- a/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts +++ b/packages/llamaindex/tests/vectorStores/MilvusVectorStore.test.ts @@ -1,14 +1,12 @@ import type { BaseNode } from "@llamaindex/core/schema"; import { TextNode } from "@llamaindex/core/schema"; -import type { MilvusClient } from "@zilliz/milvus2-sdk-node"; import { MilvusVectorStore, VectorStoreQueryMode, type MetadataFilters, } from "llamaindex"; -import { beforeEach, describe, expect, it, vi, type Mocked } from "vitest"; - -vi.mock("@qdrant/js-client-rest"); +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { TestableMilvusVectorStore } from "../mocks/TestableMilvusVectorStore.js"; type FilterTestCase = { title: string; @@ -18,26 +16,6 @@ type FilterTestCase = { mockResultIds: string[]; }; -export class TestableMilvusVectorStore extends MilvusVectorStore { - public nodes: BaseNode[] = []; - - private fakeTimeout = (ms: number) => { - return new Promise((resolve) => setTimeout(resolve, ms)); - }; - - public async add(nodes: BaseNode[]): Promise { - this.nodes.push(...nodes); - await this.fakeTimeout(100); - return nodes.map((node) => node.id_); - } - - constructor() { - super({ - milvusClient: {} as Mocked, - }); - } -} - describe("MilvusVectorStore", () => { let store: MilvusVectorStore; let nodes: BaseNode[]; From 03b0bae89b36a66fc79480a9f8adb26b3b8e4cd5 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Wed, 17 Jul 2024 06:40:33 +0000 Subject: [PATCH 17/17] fix: typo --- .../llamaindex/src/storage/vectorStore/MilvusVectorStore.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts index 9cfd5d9660..15c3fe6e9f 100644 --- a/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts +++ b/packages/llamaindex/src/storage/vectorStore/MilvusVectorStore.ts @@ -47,7 +47,7 @@ function parseScalarFilters(scalarFilters: MetadataFilters): string { break; } case "nin": { - // Milmus does not support `nin` operator, so we need to manually check every value + // Milvus does not support `nin` operator, so we need to manually check every value // Expected: not metadata["key"] != "value1" and not metadata["key"] != "value2" const filterStr = parseArrayValue(filter.value) .map((v) => `metadata["${filter.key}"] != "${v}"`)