diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/document/DocumentParser.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/document/DocumentParser.kt index 0ce7d7a2..15d8ea36 100644 --- a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/document/DocumentParser.kt +++ b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/document/DocumentParser.kt @@ -3,19 +3,24 @@ package cc.unitmesh.rag.document import java.io.InputStream /** - * > 当前的 Chocolate Factory 主要基于 [Langchain4j](https://github.com/langchain4j/langchain4j) 的实现。 + * The DocumentParser interface is responsible for parsing different types of documents. * - * Parse the given input stream and return a list of documents. + * This interface is mainly based on the implementation of [Langchain4j](https://github.com/langchain4j/langchain4j). * - * 返回多个 [Document]: - * - [cc.unitmesh.rag.document.DocumentType.PPT] + * Returns multiple [Document]s for the following document types: + * - [cc.unitmesh.rag.document.DocumentType.PPT] * - * 返回单个 [Document]: - * - [cc.unitmesh.rag.document.DocumentType.PDF] - * - [cc.unitmesh.rag.document.DocumentType.TXT] - * - [cc.unitmesh.rag.document.DocumentType.HTML] - * - [cc.unitmesh.rag.document.DocumentType.DOC] + * Returns a single [Document] for the following document types: + * - [cc.unitmesh.rag.document.DocumentType.PDF] + * - [cc.unitmesh.rag.document.DocumentType.DOC] + * - [cc.unitmesh.rag.document.DocumentType.XLS] + * - [cc.unitmesh.rag.document.DocumentType.MD] + * - [cc.unitmesh.rag.document.DocumentType.HTML] + * - [cc.unitmesh.rag.document.DocumentType.TXT] * + * The DocumentParser interface provides a method to parse the input stream of a document. + * + * @property DOCUMENT_TYPE The constant value representing the document type. 
*/ interface DocumentParser { fun parse(inputStream: InputStream): List diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt index 56698661..5913dfbe 100644 --- a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt +++ b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt @@ -20,16 +20,70 @@ package cc.unitmesh.rag.store import cc.unitmesh.nlp.embedding.Embedding +/** + * 向量数据库的核心是将数据表示为向量,并使用向量空间中的距离度量来进行数据的存储、检索和分析。 + * + * An interface for an Embedding Store, which is a vector database used to store and manage embeddings. + * Embeddings are high-dimensional vector representations of data points, which can be used in various + * machine learning and data retrieval tasks. + * + * @param Embedded The type of data embedded in the store. + */ interface EmbeddingStore { + /** + * Adds an embedding to the store and returns its unique identifier. + * + * @param embedding The embedding to be added. + * @return A unique identifier associated with the added embedding. + */ fun add(embedding: Embedding): String + + /** + * Adds an embedding to the store with a specified identifier. + * + * @param id The unique identifier for the embedding. + * @param embedding The embedding to be added. + */ fun add(id: String, embedding: Embedding) + + /** + * Adds an embedding to the store and associates it with the provided embedded data. + * + * @param embedding The embedding to be added. + * @param embedded The data embedded in the store. + * @return A unique identifier associated with the added embedding. + */ fun add(embedding: Embedding, embedded: Embedded): String + + /** + * Adds a list of embeddings to the store and returns a list of unique identifiers. + * + * @param embeddings The list of embeddings to be added. + * @return A list of unique identifiers associated with the added embeddings, in the same order. 
+ */ fun addAll(embeddings: List): List - fun addAll(embeddings: List, embedded: List): List - fun findRelevant(referenceEmbedding: Embedding, maxResults: Int): List> { - return findRelevant(referenceEmbedding, maxResults, 0.0) - } + /** + * Adds a list of embeddings to the store and associates them with a list of embedded data. + * + * @param embeddings The list of embeddings to be added. + * @param embedded The list of data embedded in the store. + * @return A list of unique identifiers associated with the added embeddings, in the same order. + */ + fun addAll(embeddings: List, embedded: List): List - fun findRelevant(referenceEmbedding: Embedding, maxResults: Int, minScore: Double): List> -} \ No newline at end of file + /** + * Find relevant embeddings in the store based on a reference embedding, with a maximum number of results. + * An optional minimum score can be specified to filter results. + * + * @param referenceEmbedding The reference embedding to compare against. + * @param maxResults The maximum number of results to retrieve. + * @param minScore The minimum similarity score required to include a result (default is 0.0). + * @return A list of [EmbeddingMatch] objects representing relevant matches. + */ + fun findRelevant( + referenceEmbedding: Embedding, + maxResults: Int, + minScore: Double = 0.0, + ): List> +} diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStore.kt index 5a761e0d..aa71b912 100644 --- a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStore.kt +++ b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStore.kt @@ -12,6 +12,13 @@ data class Entry( var embedded: Embedded?, ) +/** + * The `InMemoryEmbeddingStore` class is an implementation of the `EmbeddingStore` interface that stores embeddings in memory. 
+ * It provides methods to add embeddings, retrieve relevant embeddings, and manage the storage of embeddings. + * + * @param Embedded the type of the embedded object associated with each embedding + * + */ class InMemoryEmbeddingStore : EmbeddingStore { private val entries: MutableList> = ArrayList() override fun add(embedding: Embedding): String { diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStore.kt index 1de1729b..98add21a 100644 --- a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStore.kt +++ b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStore.kt @@ -8,6 +8,25 @@ import java.util.* /** * A simple in-memory English implementation of [EmbeddingStore]. + * + * This class represents an in-memory storage for English text embeddings. It implements the [EmbeddingStore] interface, + * which provides methods for adding and retrieving embeddings. + * + * The class stores the embeddings in a mutable list of [Entry] objects. Each entry contains an ID, an embedding, and an + * optional embedded object. The ID is generated using the [IdUtil.uuid] method. The class provides multiple overloaded + * methods for adding embeddings, allowing the user to specify the ID and the embedded object. + * + * The class also provides methods for adding multiple embeddings at once. The [addAll] method takes a list of embeddings + * and adds them to the store, returning a list of IDs for the added embeddings. There is also an overloaded version of + * [addAll] that takes a list of embeddings and a list of embedded objects, ensuring that both lists have the same size. + * + * The [findRelevant] method allows the user to find the most relevant embeddings in the store based on a reference + * embedding. It takes the reference embedding, the maximum number of results to return, and the minimum relevance score + * as parameters. 
It calculates the cosine similarity between the reference embedding and each entry in the store, and + * filters the entries based on the minimum score. The method returns a list of [EmbeddingMatch] objects, sorted by + * relevance score in descending order. + * + * @param Embedded the type of the embedded object associated with each embedding */ class InMemoryEnglishTextStore : EmbeddingStore { private val entries: MutableList> = ArrayList() diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/VectorStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/VectorStore.kt deleted file mode 100644 index ff08f5df..00000000 --- a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/VectorStore.kt +++ /dev/null @@ -1,16 +0,0 @@ -package cc.unitmesh.rag.store - -import cc.unitmesh.nlp.embedding.Embedding -import cc.unitmesh.nlp.similarity.Similarity -import java.util.* - - -interface VectorStore { - val similarity: Similarity - fun addAll(documents: List) - fun add(document: Embedded) - - fun add(id: String, embedding: Embedding) - - fun findRelevant(referenceEmbedding: Embedding, maxResults: Int, minSimilarity: Double): List -} diff --git a/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStoreTest.kt b/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStoreTest.kt index 620ff9a5..b43b8808 100644 --- a/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStoreTest.kt +++ b/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStoreTest.kt @@ -1,5 +1,6 @@ package cc.unitmesh.rag.store; +import cc.unitmesh.docs.SampleCode import cc.unitmesh.nlp.embedding.Embedding import cc.unitmesh.nlp.embedding.toEmbedding import cc.unitmesh.rag.document.Document @@ -7,8 +8,24 @@ import org.assertj.core.api.Assertions import org.junit.jupiter.api.Test class InMemoryEmbeddingStoreTest { + @Test + @SampleCode + fun it_works() { + // start-sample + val embeddingStore: EmbeddingStore = InMemoryEmbeddingStore() + + 
embeddingStore.add(toEmbedding(floatArrayOf(1f, 3f)), Document.from("first")) + embeddingStore.add(toEmbedding(floatArrayOf(2f, 2f)), Document.from("second")) + + val relevant: List> = + embeddingStore.findRelevant(toEmbedding(floatArrayOf(4f, 0f)), 2) + + // end-sample + + } @Test + @SampleCode(name = "文本嵌入示例", content = "") fun should_add_embedding_with_generated_id() { val embeddingStore: EmbeddingStore = InMemoryEmbeddingStore() diff --git a/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStoreTest.kt b/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStoreTest.kt index 4478fce8..e9702942 100644 --- a/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStoreTest.kt +++ b/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStoreTest.kt @@ -1,5 +1,6 @@ package cc.unitmesh.rag.store; +import cc.unitmesh.docs.SampleCode import cc.unitmesh.nlp.embedding.text.EnglishTextEmbeddingProvider import org.junit.jupiter.api.Assertions.* import org.junit.jupiter.api.Test @@ -7,8 +8,9 @@ import org.junit.jupiter.api.Test class InMemoryEnglishTextStoreTest { private val provider = EnglishTextEmbeddingProvider() @Test + @SampleCode fun should_find_relevant_embeddings() { - // given + // start-sample val store = InMemoryEnglishTextStore() store.add(provider.embed("this is a example"), "this is a example") @@ -19,6 +21,7 @@ class InMemoryEnglishTextStoreTest { val minScore = 0.5 val embedding4 = provider.embed("this is a cat") + // end-sample // when val relevantEmbeddings = store.findRelevant(embedding4, maxResults, minScore) diff --git a/docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt b/docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt index 4291d3ea..b44d5325 100644 --- a/docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt +++ b/docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt @@ -20,12 +20,14 @@ class KDocGen(private val rootDir: Path) : DocGenerator() { private var fileNodes = 
listOf() override fun execute(): List { - fileNodes = fileNodes + processor.process(rootDir) + fileNodes += processor.process(rootDir) return extractNodes(fileNodes) } - fun appendNodes(dir: Path) { - fileNodes = fileNodes + processor.process(dir) + fun appendNodes(vararg dirs: Path) { + dirs.forEach { + fileNodes += processor.process(it) + } } fun extractNodes(fileASTNodes: List): List { diff --git a/docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt b/docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt index 71142801..0a7f88c0 100644 --- a/docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt +++ b/docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt @@ -16,6 +16,7 @@ class Runner : CliktCommand() { processRagScript(rootDir) processPromptScript(rootDir) processDocumentModule(rootDir) + processVectorStoreModule(rootDir) } private val warningLog = @@ -89,6 +90,35 @@ class Runner : CliktCommand() { } } + private fun processVectorStoreModule(rootDir: Path) { + val documentDir = rootDir.resolve("cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/") + val kDocGen = KDocGen(documentDir) + + kDocGen.appendNodes( + rootDir.resolve("cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/"), + rootDir.resolve("rag-modules/store-elasticsearch"), + rootDir.resolve("rag-modules/store-milvus"), + rootDir.resolve("rag-modules/store-pinecone") + ) + + val documentDocs = kDocGen.execute() + + val docs = renderDocs(documentDocs) + val outputDir = rootDir.resolve("docs/rag/") + var index = 11 + docs.forEach { (name, content) -> + var output = + CustomJekyllFrontMatter("Vector Store", "Retrieval Augmented Generation", index, "/rag/vector-store") + .toMarkdown() + + output = "$output$warningLog" + + val outputFile = outputDir.resolve("vector-store.md") + outputFile.toFile().writeText(output + "\n\n" + content) + index += 1 + } + } + private fun renderDocs(rootDocContents: List): Map { return rootDocContents.associate { treeDoc -> diff --git a/docs/rag-script/workflow.md 
b/docs/rag-script/workflow.md index 6dcba9a4..1e95c186 100644 --- a/docs/rag-script/workflow.md +++ b/docs/rag-script/workflow.md @@ -171,8 +171,7 @@ indexing { ## querying -Querying is a function block for querying data for the workflow. You don't need to call it as block. - +Querying is a function block for querying data for the workflow. You don't need to call it as a block. for example: ```kotlin querying { diff --git a/docs/rag/document.md b/docs/rag/document.md index 91a24168..7b2436fb 100644 --- a/docs/rag/document.md +++ b/docs/rag/document.md @@ -11,18 +11,23 @@ Automatically generated documentation; use the command `./gradlew :docs-builder: # DocumentParser -> > 当前的 Chocolate Factory 主要基于 [Langchain4j](https://github.com/langchain4j/langchain4j) 的实现。 +> The DocumentParser interface is responsible for parsing different types of documents. -Parse the given input stream and return a list of documents. +This interface is mainly based on the implementation of [Langchain4j](https://github.com/langchain4j/langchain4j). -返回多个 [Document]: +Returns multiple [Document]s for the following document types: - [cc.unitmesh.rag.document.DocumentType.PPT] -返回单个 [Document]: +Returns a single [Document] for the following document types: - [cc.unitmesh.rag.document.DocumentType.PDF] -- [cc.unitmesh.rag.document.DocumentType.TXT] -- [cc.unitmesh.rag.document.DocumentType.HTML] - [cc.unitmesh.rag.document.DocumentType.DOC] +- [cc.unitmesh.rag.document.DocumentType.XLS] +- [cc.unitmesh.rag.document.DocumentType.MD] +- [cc.unitmesh.rag.document.DocumentType.HTML] +- [cc.unitmesh.rag.document.DocumentType.TXT] + +The DocumentParser interface provides a method to parse the input stream of a document. 
+ ## MdDocumentParser diff --git a/docs/rag/vector-store.md b/docs/rag/vector-store.md index ba42bb5e..369b3e33 100644 --- a/docs/rag/vector-store.md +++ b/docs/rag/vector-store.md @@ -2,51 +2,93 @@ layout: default title: Vector Store parent: Retrieval Augmented Generation -nav_order: 3 +nav_order: 11 +permalink: /rag/vector-store --- -向量数据库的核心在于相似性搜索(Similarity Search),即在向量空间中搜索与查询向量最相似的向量。 +{: .warning } +Automatically generated documentation; use the command `./gradlew :docs-builder:run` and update comments in the source code to reflect changes. -## InMemoryEmbeddingStore +# EmbeddingStore + +> 向量数据库的核心是将数据表示为向量,并使用向量空间中的距离度量来进行数据的存储、检索和分析。 + +An interface for an Embedding Store, which is a vector database used to store and manage embeddings. +Embeddings are high-dimensional vector representations of data points, which can be used in various +machine learning and data retrieval tasks. + + + +## ElasticsearchStore + +ElasticsearchStore is an implementation of the EmbeddingStore interface that uses Elasticsearch as the underlying storage. +It allows storing and retrieving embeddings along with associated documents. + +The ElasticsearchStore class requires the following parameters to be provided: +- serverUrl: The URL of the Elasticsearch server. The default value is "http://localhost:9200". +- indexName: The name of the Elasticsearch index to use. The default value is "chocolate-code". +- username: The username for authentication with the Elasticsearch server. This parameter is optional. +- password: The password for authentication with the Elasticsearch server. This parameter is optional. +- apiKey: The API key for authentication with the Elasticsearch server. This parameter is optional. + +The ElasticsearchStore class provides methods for adding embeddings and documents, as well as retrieving relevant embeddings based on a reference embedding. ```kotlin -// 1. initialize the vector store -val vectorStore: EmbeddingStore = InMemoryEmbeddingStore() -// 2. 
add the embeddings -vectorStore.addAll(embeddings, documentList) - -// 3. retrieves the similar documents -val vectorStoreRetriever = EmbeddingStoreRetriever(vectorStore) -val similarDocuments: List> = vectorStoreRetriever.retrieve(userQuery) +val store: ElasticsearchStore = ElasticsearchStore(elasticsearchUrl) ``` -## Elasticsearch +## InMemoryEmbeddingStore -Elasticsearch 可以提供向量搜索,以及普通的文本搜索,可以作为代码搜索的后端场景。 +The `InMemoryEmbeddingStore` class is an implementation of the `EmbeddingStore` interface that stores embeddings in memory. +It provides methods to add embeddings, retrieve relevant embeddings, and manage the storage of embeddings. -实现类:ElasticsearchStore,基于 LangChain4j 的代码,实现了一个 Elasticsearch 的存储引擎。 +Sample: ```kotlin -@Throws(JsonProcessingException::class) -private fun buildDefaultScriptScoreQuery(vector: Embedding, minScore: Float): ScriptScoreQuery { - val queryVector = toJsonData(vector) - return ScriptScoreQuery.of { q: ScriptScoreQuery.Builder -> - q - .minScore(minScore) - .query(Query.of { qu: Query.Builder -> qu.matchAll { m: MatchAllQuery.Builder? -> m } }) - .script { s: Script.Builder -> - s.inline(InlineScript.of { i: InlineScript.Builder -> - i // The script adds 1.0 to the cosine similarity to prevent the score from being negative. - // divided by 2 to keep the score in the range [0, 1] - .source("(cosineSimilarity(params.query_vector, 'vector') + 1.0) / 2") - .params("query_vector", queryVector) - }) - } - } -} +val embeddingStore: EmbeddingStore = InMemoryEmbeddingStore() + +embeddingStore.add(toEmbedding(floatArrayOf(1f, 3f)), Document.from("first")) +embeddingStore.add(toEmbedding(floatArrayOf(2f, 2f)), Document.from("second")) + +val relevant: List> = + embeddingStore.findRelevant(toEmbedding(floatArrayOf(4f, 0f)), 2) ``` -## pgvector (TODO) +## InMemoryEnglishTextStore + +A simple in-memory English implementation of [EmbeddingStore]. + +This class represents an in-memory storage for English text embeddings. 
It implements the [EmbeddingStore] interface, +which provides methods for adding and retrieving embeddings. -> Open-source vector similarity search for Postgres +The class stores the embeddings in a mutable list of [Entry] objects. Each entry contains an ID, an embedding, and an +optional embedded object. The ID is generated using the [IdUtil.uuid] method. The class provides multiple overloaded +methods for adding embeddings, allowing the user to specify the ID and the embedded object. + +The class also provides methods for adding multiple embeddings at once. The [addAll] method takes a list of embeddings +and adds them to the store, returning a list of IDs for the added embeddings. There is also an overloaded version of +[addAll] that takes a list of embeddings and a list of embedded objects, ensuring that both lists have the same size. + +The [findRelevant] method allows the user to find the most relevant embeddings in the store based on a reference +embedding. It takes the reference embedding, the maximum number of results to return, and the minimum relevance score +as parameters. It calculates the cosine similarity between the reference embedding and each entry in the store, and +filters the entries based on the minimum score. The method returns a list of [EmbeddingMatch] objects, sorted by +relevance score in descending order. 
+ + + +Sample: + +```kotlin +val store = InMemoryEnglishTextStore() + +store.add(provider.embed("this is a example"), "this is a example") +store.add(provider.embed("this is a dog"), "this is a dog") +store.add(provider.embed("this is item list"), "this is item list") + +val maxResults = 1 +val minScore = 0.5 + +val embedding4 = provider.embed("this is a cat") +``` diff --git a/rag-modules/store-elasticsearch/src/main/kotlin/cc/unitmesh/store/ElasticsearchStore.kt b/rag-modules/store-elasticsearch/src/main/kotlin/cc/unitmesh/store/ElasticsearchStore.kt index 3c063be8..186b5f48 100644 --- a/rag-modules/store-elasticsearch/src/main/kotlin/cc/unitmesh/store/ElasticsearchStore.kt +++ b/rag-modules/store-elasticsearch/src/main/kotlin/cc/unitmesh/store/ElasticsearchStore.kt @@ -41,7 +41,21 @@ import java.io.IOException import java.util.* /** - * Elastic Embedding Store Implementation + * ElasticsearchStore is an implementation of the EmbeddingStore interface that uses Elasticsearch as the underlying storage. + * It allows storing and retrieving embeddings along with associated documents. + * + * The ElasticsearchStore class accepts the following constructor parameters: + * - serverUrl: The URL of the Elasticsearch server. The default value is "http://localhost:9200". + * - indexName: The name of the Elasticsearch index to use. The default value is "chocolate-code". + * - username: The username for authentication with the Elasticsearch server. This parameter is optional. + * - password: The password for authentication with the Elasticsearch server. This parameter is optional. + * - apiKey: The API key for authentication with the Elasticsearch server. This parameter is optional. + * + * The ElasticsearchStore class provides methods for adding embeddings and documents, as well as retrieving relevant embeddings based on a reference embedding. 
+ * + * ```kotlin + * val store: ElasticsearchStore = ElasticsearchStore(elasticsearchUrl) + * ``` */ class ElasticsearchStore( private val serverUrl: String = "http://localhost:9200",