docs: update docs for VectorStore

unit-mesh · Oct 17, 2023 · 18a5eac · 18a5eac
1 parent d249a0d
commit 18a5eac
Show file tree

Hide file tree

Showing 13 changed files with 258 additions and 77 deletions.
diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/document/DocumentParser.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/document/DocumentParser.kt
@@ -3,19 +3,24 @@ package cc.unitmesh.rag.document
 import java.io.InputStream
 
 /**
- * > 当前的 Chocolate Factory 主要基于 [Langchain4j](https://github.com/langchain4j/langchain4j) 的实现。
+ * The DocumentParser interface is responsible for parsing different types of documents.
  *
- *  Parse the given input stream and return a list of documents.
+ * This interface is mainly based on the implementation of [Langchain4j](https://github.com/langchain4j/langchain4j).
  *
- *  返回多个 [Document]:
- *  - [cc.unitmesh.rag.document.DocumentType.PPT]
+ * The following document types are parsed into multiple [Document]s:
+ * - [cc.unitmesh.rag.document.DocumentType.PPT]
  *
- * 返回单个 [Document]:
- * - [cc.unitmesh.rag.document.DocumentType.PDF]
- * - [cc.unitmesh.rag.document.DocumentType.TXT]
- * - [cc.unitmesh.rag.document.DocumentType.HTML]
- * - [cc.unitmesh.rag.document.DocumentType.DOC]
+ * The following document types are parsed into a single [Document]:
+ * - [cc.unitmesh.rag.document.DocumentType.PDF]
+ * - [cc.unitmesh.rag.document.DocumentType.DOC]
+ * - [cc.unitmesh.rag.document.DocumentType.XLS]
+ * - [cc.unitmesh.rag.document.DocumentType.MD]
+ * - [cc.unitmesh.rag.document.DocumentType.HTML]
+ * - [cc.unitmesh.rag.document.DocumentType.TXT]
  *
+ * The DocumentParser interface provides a method to parse the input stream of a document.
+ *
+ * @property DOCUMENT_TYPE The constant value representing the document type.
  */
 interface DocumentParser {
  fun parse(inputStream: InputStream): List<Document>

diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt
@@ -20,16 +20,70 @@ package cc.unitmesh.rag.store
 import cc.unitmesh.nlp.embedding.Embedding
 
 
+/**
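+ * The core of a vector database is to represent data as vectors and to use distance metrics in the vector space to store, retrieve, and analyze that data.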
+ *
+ * An interface for an Embedding Store, which is a vector database used to store and manage embeddings.
+ * Embeddings are high-dimensional vector representations of data points, which can be used in various
+ * machine learning and data retrieval tasks.
+ *
+ * @param Embedded The type of data embedded in the store.
+ */
 interface EmbeddingStore<Embedded> {
+ /**
+ * Adds an embedding to the store and returns its unique identifier.
+ *
+ * @param embedding The embedding to be added.
+ * @return A unique identifier associated with the added embedding.
+ */
  fun add(embedding: Embedding): String
+
+ /**
+ * Adds an embedding to the store with a specified identifier.
+ *
+ * @param id The unique identifier for the embedding.
+ * @param embedding The embedding to be added.
+ */
  fun add(id: String, embedding: Embedding)
+
+ /**
+ * Adds an embedding to the store and associates it with the provided embedded data.
+ *
+ * @param embedding The embedding to be added.
+ * @param embedded The data embedded in the store.
+ * @return A unique identifier associated with the added embedding.
+ */
  fun add(embedding: Embedding, embedded: Embedded): String
+
+ /**
+ * Adds a list of embeddings to the store and returns a list of unique identifiers.
+ *
+ * @param embeddings The list of embeddings to be added.
+ * @return A list of unique identifiers associated with the added embeddings, in the same order.
+ */
  fun addAll(embeddings: List<Embedding>): List<String>
- fun addAll(embeddings: List<Embedding>, embedded: List<Embedded>): List<String>
 
- fun findRelevant(referenceEmbedding: Embedding, maxResults: Int): List<EmbeddingMatch<Embedded>> {
- return findRelevant(referenceEmbedding, maxResults, 0.0)
- }
+ /**
+ * Adds a list of embeddings to the store and associates them with a list of embedded data.
+ *
+ * @param embeddings The list of embeddings to be added.
+ * @param embedded The list of data embedded in the store.
+ * @return A list of unique identifiers associated with the added embeddings, in the same order.
+ */
+ fun addAll(embeddings: List<Embedding>, embedded: List<Embedded>): List<String>
 
- fun findRelevant(referenceEmbedding: Embedding, maxResults: Int, minScore: Double): List<EmbeddingMatch<Embedded>>
-}
+ /**
+ * Find relevant embeddings in the store based on a reference embedding, with a maximum number of results.
+ * An optional minimum score can be specified to filter results.
+ *
+ * @param referenceEmbedding The reference embedding to compare against.
+ * @param maxResults The maximum number of results to retrieve.
+ * @param minScore The minimum similarity score required to include a result (default is 0.0).
+ * @return A list of [EmbeddingMatch] objects representing relevant matches.
+ */
+ fun findRelevant(
+ referenceEmbedding: Embedding,
+ maxResults: Int,
+ minScore: Double = 0.0,
+ ): List<EmbeddingMatch<Embedded>>
+}
diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStore.kt
@@ -12,6 +12,13 @@ data class Entry<Embedded>(
  var embedded: Embedded?,
 )
 
+/**
+ * The `InMemoryEmbeddingStore` class is an implementation of the `EmbeddingStore` interface that stores embeddings in memory.
+ * It provides methods to add embeddings, retrieve relevant embeddings, and manage the storage of embeddings.
+ *
+ * @param Embedded the type of the embedded object associated with each embedding
+ *
+ */
 class InMemoryEmbeddingStore<Embedded> : EmbeddingStore<Embedded> {
  private val entries: MutableList<Entry<Embedded>> = ArrayList()
  override fun add(embedding: Embedding): String {

diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStore.kt
@@ -8,6 +8,25 @@ import java.util.*
 
 /**
  * A simple in-memory English implementation of [EmbeddingStore].
+ *
+ * This class represents an in-memory storage for English text embeddings. It implements the [EmbeddingStore] interface,
+ * which provides methods for adding and retrieving embeddings.
+ *
+ * The class stores the embeddings in a mutable list of [Entry] objects. Each entry contains an ID, an embedding, and an
+ * optional embedded object. The ID is generated using the [IdUtil.uuid] method. The class provides multiple overloaded
+ * methods for adding embeddings, allowing the user to specify the ID and the embedded object.
+ *
+ * The class also provides methods for adding multiple embeddings at once. The [addAll] method takes a list of embeddings
+ * and adds them to the store, returning a list of IDs for the added embeddings. There is also an overloaded version of
+ * [addAll] that takes a list of embeddings and a list of embedded objects, ensuring that both lists have the same size.
+ *
+ * The [findRelevant] method allows the user to find the most relevant embeddings in the store based on a reference
+ * embedding. It takes the reference embedding, the maximum number of results to return, and the minimum relevance score
+ * as parameters. It calculates the cosine similarity between the reference embedding and each entry in the store, and
+ * filters the entries based on the minimum score. The method returns a list of [EmbeddingMatch] objects, sorted by
+ * relevance score in descending order.
+ *
+ * @param Embedded the type of the embedded object associated with each embedding
  */
 class InMemoryEnglishTextStore<Embedded> : EmbeddingStore<Embedded> {
  private val entries: MutableList<Entry<Embedded>> = ArrayList()

diff --git a/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/VectorStore.kt b/cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/VectorStore.kt
diff --git a/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStoreTest.kt b/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEmbeddingStoreTest.kt
@@ -1,14 +1,31 @@
 package cc.unitmesh.rag.store;
 
+import cc.unitmesh.docs.SampleCode
 import cc.unitmesh.nlp.embedding.Embedding
 import cc.unitmesh.nlp.embedding.toEmbedding
 import cc.unitmesh.rag.document.Document
 import org.assertj.core.api.Assertions
 import org.junit.jupiter.api.Test
 
 class InMemoryEmbeddingStoreTest {
+ @Test
+ @SampleCode
+ fun it_works() {
+ // start-sample
+ val embeddingStore: EmbeddingStore<Document> = InMemoryEmbeddingStore()
+
+ embeddingStore.add(toEmbedding(floatArrayOf(1f, 3f)), Document.from("first"))
+ embeddingStore.add(toEmbedding(floatArrayOf(2f, 2f)), Document.from("second"))
+
+ val relevant: List<EmbeddingMatch<Document>> =
+ embeddingStore.findRelevant(toEmbedding(floatArrayOf(4f, 0f)), 2)
+
+ // end-sample
+
+ }
 
  @Test
+ @SampleCode(name = "文本嵌入示例", content = "")
  fun should_add_embedding_with_generated_id() {
  val embeddingStore: EmbeddingStore<Document> = InMemoryEmbeddingStore()
 

diff --git a/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStoreTest.kt b/cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/InMemoryEnglishTextStoreTest.kt
@@ -1,14 +1,16 @@
 package cc.unitmesh.rag.store;
 
+import cc.unitmesh.docs.SampleCode
 import cc.unitmesh.nlp.embedding.text.EnglishTextEmbeddingProvider
 import org.junit.jupiter.api.Assertions.*
 import org.junit.jupiter.api.Test
 
 class InMemoryEnglishTextStoreTest {
  private val provider = EnglishTextEmbeddingProvider()
  @Test
+ @SampleCode
  fun should_find_relevant_embeddings() {
- // given
+ // start-sample
  val store = InMemoryEnglishTextStore<String>()
 
  store.add(provider.embed("this is a example"), "this is a example")
@@ -19,6 +21,7 @@ class InMemoryEnglishTextStoreTest {
  val minScore = 0.5
 
  val embedding4 = provider.embed("this is a cat")
+ // end-sample
 
  // when
  val relevantEmbeddings = store.findRelevant(embedding4, maxResults, minScore)

diff --git a/docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt b/docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt
@@ -20,12 +20,14 @@ class KDocGen(private val rootDir: Path) : DocGenerator() {
  private var fileNodes = listOf<FileASTNode>()
 
  override fun execute(): List<RootDocContent> {
- fileNodes = fileNodes + processor.process(rootDir)
+ fileNodes += processor.process(rootDir)
  return extractNodes(fileNodes)
  }
 
- fun appendNodes(dir: Path) {
- fileNodes = fileNodes + processor.process(dir)
+ fun appendNodes(vararg dirs: Path) {
+ dirs.forEach {
+ fileNodes += processor.process(it)
+ }
  }
 
  fun extractNodes(fileASTNodes: List<FileASTNode>): List<RootDocContent> {

diff --git a/docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt b/docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt
@@ -16,6 +16,7 @@ class Runner : CliktCommand() {
  processRagScript(rootDir)
  processPromptScript(rootDir)
  processDocumentModule(rootDir)
+ processVectorStoreModule(rootDir)
  }
 
  private val warningLog =
@@ -89,6 +90,35 @@ class Runner : CliktCommand() {
  }
  }
 
+ private fun processVectorStoreModule(rootDir: Path) {
+ val documentDir = rootDir.resolve("cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/")
+ val kDocGen = KDocGen(documentDir)
+
+ kDocGen.appendNodes(
+ rootDir.resolve("cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/"),
+ rootDir.resolve("rag-modules/store-elasticsearch"),
+ rootDir.resolve("rag-modules/store-milvus"),
+ rootDir.resolve("rag-modules/store-pinecone")
+ )
+
+ val documentDocs = kDocGen.execute()
+
+ val docs = renderDocs(documentDocs)
+ val outputDir = rootDir.resolve("docs/rag/")
+ var index = 11
+ docs.forEach { (name, content) ->
+ var output =
+ CustomJekyllFrontMatter("Vector Store", "Retrieval Augmented Generation", index, "/rag/vector-store")
+ .toMarkdown()
+
+ output = "$output$warningLog"
+
+ val outputFile = outputDir.resolve("vector-store.md")
+ outputFile.toFile().writeText(output + "\n\n" + content)
+ index += 1
+ }
+ }
+
 
  private fun renderDocs(rootDocContents: List<RootDocContent>): Map<String, String> {
  return rootDocContents.associate { treeDoc ->

diff --git a/docs/rag-script/workflow.md b/docs/rag-script/workflow.md
@@ -171,8 +171,7 @@ indexing {
 
 ## querying 
 
-Querying is a function block for querying data for the workflow. You don't need to call it as block.
-
+Querying is a function block for querying data for the workflow. You don't need to call it as a block.
 for example:
 ```kotlin
 querying {

diff --git a/docs/rag/document.md b/docs/rag/document.md
@@ -11,18 +11,23 @@ Automatically generated documentation; use the command `./gradlew :docs-builder:
 
 # DocumentParser 
 
-> > 当前的 Chocolate Factory 主要基于 [Langchain4j](https://github.com/langchain4j/langchain4j) 的实现。
+> The DocumentParser interface is responsible for parsing different types of documents.
 
-Parse the given input stream and return a list of documents.
+This interface is mainly based on the implementation of [Langchain4j](https://github.com/langchain4j/langchain4j).
 
-返回多个 [Document]:
+The following document types are parsed into multiple [Document]s:
 - [cc.unitmesh.rag.document.DocumentType.PPT]
 
-返回单个 [Document]:
+The following document types are parsed into a single [Document]:
 - [cc.unitmesh.rag.document.DocumentType.PDF]
-- [cc.unitmesh.rag.document.DocumentType.TXT]
-- [cc.unitmesh.rag.document.DocumentType.HTML]
 - [cc.unitmesh.rag.document.DocumentType.DOC]
+- [cc.unitmesh.rag.document.DocumentType.XLS]
+- [cc.unitmesh.rag.document.DocumentType.MD]
+- [cc.unitmesh.rag.document.DocumentType.HTML]
+- [cc.unitmesh.rag.document.DocumentType.TXT]
+
+The DocumentParser interface provides a method to parse the input stream of a document.
+
 
 
 ## MdDocumentParser