Skip to content

Commit

Permalink
docs: update docs for VectorStore
Browse files Browse the repository at this point in the history
  • Loading branch information
phodal committed Oct 17, 2023
1 parent d249a0d commit 18a5eac
Show file tree
Hide file tree
Showing 13 changed files with 258 additions and 77 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,24 @@ package cc.unitmesh.rag.document
import java.io.InputStream

/**
* > 当前的 Chocolate Factory 主要基于 [Langchain4j](https://github.com/langchain4j/langchain4j) 的实现。
* The DocumentParser interface is responsible for parsing different types of documents.
*
* Parse the given input stream and return a list of documents.
* This interface is mainly based on the implementation of [Langchain4j](https://github.com/langchain4j/langchain4j).
*
* 返回多个 [Document]:
* - [cc.unitmesh.rag.document.DocumentType.PPT]
* The DocumentParser interface returns multiple [Document]s for:
* - [cc.unitmesh.rag.document.DocumentType.PPT]
*
* 返回单个 [Document]:
* - [cc.unitmesh.rag.document.DocumentType.PDF]
* - [cc.unitmesh.rag.document.DocumentType.TXT]
* - [cc.unitmesh.rag.document.DocumentType.HTML]
* - [cc.unitmesh.rag.document.DocumentType.DOC]
* The DocumentParser interface returns a single [Document] for:
* - [cc.unitmesh.rag.document.DocumentType.PDF]
* - [cc.unitmesh.rag.document.DocumentType.DOC]
* - [cc.unitmesh.rag.document.DocumentType.XLS]
* - [cc.unitmesh.rag.document.DocumentType.MD]
* - [cc.unitmesh.rag.document.DocumentType.HTML]
* - [cc.unitmesh.rag.document.DocumentType.TXT]
*
* The DocumentParser interface provides a method to parse the input stream of a document.
*
* @property DOCUMENT_TYPE The constant value representing the document type.
*/
interface DocumentParser {
fun parse(inputStream: InputStream): List<Document>
Expand Down
66 changes: 60 additions & 6 deletions cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/EmbeddingStore.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,70 @@ package cc.unitmesh.rag.store
import cc.unitmesh.nlp.embedding.Embedding


/**
* 向量数据库的核心是将数据表示为向量,并使用向量空间中的距离度量来进行数据的存储、检索和分析。
*
* An interface for an Embedding Store, which is a vector database used to store and manage embeddings.
* Embeddings are high-dimensional vector representations of data points, which can be used in various
* machine learning and data retrieval tasks.
*
* @param Embedded The type of data embedded in the store.
*/
interface EmbeddingStore<Embedded> {
/**
* Adds an embedding to the store and returns its unique identifier.
*
* @param embedding The embedding to be added.
* @return A unique identifier associated with the added embedding.
*/
fun add(embedding: Embedding): String

/**
* Adds an embedding to the store with a specified identifier.
*
* @param id The unique identifier for the embedding.
* @param embedding The embedding to be added.
*/
fun add(id: String, embedding: Embedding)

/**
* Adds an embedding to the store and associates it with the provided embedded data.
*
* @param embedding The embedding to be added.
* @param embedded The data embedded in the store.
* @return A unique identifier associated with the added embedding.
*/
fun add(embedding: Embedding, embedded: Embedded): String

/**
* Adds a list of embeddings to the store and returns a list of unique identifiers.
*
* @param embeddings The list of embeddings to be added.
* @return A list of unique identifiers associated with the added embeddings, in the same order.
*/
fun addAll(embeddings: List<Embedding>): List<String>
fun addAll(embeddings: List<Embedding>, embedded: List<Embedded>): List<String>

fun findRelevant(referenceEmbedding: Embedding, maxResults: Int): List<EmbeddingMatch<Embedded>> {
return findRelevant(referenceEmbedding, maxResults, 0.0)
}
/**
* Adds a list of embeddings to the store and associates them with a list of embedded data.
*
* @param embeddings The list of embeddings to be added.
* @param embedded The list of data embedded in the store.
* @return A list of unique identifiers associated with the added embeddings, in the same order.
*/
fun addAll(embeddings: List<Embedding>, embedded: List<Embedded>): List<String>

fun findRelevant(referenceEmbedding: Embedding, maxResults: Int, minScore: Double): List<EmbeddingMatch<Embedded>>
}
/**
* Find relevant embeddings in the store based on a reference embedding, with a maximum number of results.
* An optional minimum score can be specified to filter results.
*
* @param referenceEmbedding The reference embedding to compare against.
* @param maxResults The maximum number of results to retrieve.
* @param minScore The minimum similarity score required to include a result (default is 0.0).
* @return A list of [EmbeddingMatch] objects representing relevant matches.
*/
fun findRelevant(
referenceEmbedding: Embedding,
maxResults: Int,
minScore: Double = 0.0,
): List<EmbeddingMatch<Embedded>>
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ data class Entry<Embedded>(
var embedded: Embedded?,
)

/**
* The `InMemoryEmbeddingStore` class is an implementation of the `EmbeddingStore` interface that stores embeddings in memory.
* It provides methods to add embeddings, retrieve relevant embeddings, and manage the storage of embeddings.
*
* @param Embedded the type of the embedded object associated with each embedding
*
*/
class InMemoryEmbeddingStore<Embedded> : EmbeddingStore<Embedded> {
private val entries: MutableList<Entry<Embedded>> = ArrayList()
override fun add(embedding: Embedding): String {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,25 @@ import java.util.*

/**
* A simple in-memory English implementation of [EmbeddingStore].
*
* This class represents an in-memory storage for English text embeddings. It implements the [EmbeddingStore] interface,
* which provides methods for adding and retrieving embeddings.
*
* The class stores the embeddings in a mutable list of [Entry] objects. Each entry contains an ID, an embedding, and an
* optional embedded object. The ID is generated using the [IdUtil.uuid] method. The class provides multiple overloaded
* methods for adding embeddings, allowing the user to specify the ID and the embedded object.
*
* The class also provides methods for adding multiple embeddings at once. The [addAll] method takes a list of embeddings
* and adds them to the store, returning a list of IDs for the added embeddings. There is also an overloaded version of
* [addAll] that takes a list of embeddings and a list of embedded objects, ensuring that both lists have the same size.
*
* The [findRelevant] method allows the user to find the most relevant embeddings in the store based on a reference
* embedding. It takes the reference embedding, the maximum number of results to return, and the minimum relevance score
* as parameters. It calculates the cosine similarity between the reference embedding and each entry in the store, and
* filters the entries based on the minimum score. The method returns a list of [EmbeddingMatch] objects, sorted by
* relevance score in descending order.
*
* @param Embedded the type of the embedded object associated with each embedding
*/
class InMemoryEnglishTextStore<Embedded> : EmbeddingStore<Embedded> {
private val entries: MutableList<Entry<Embedded>> = ArrayList()
Expand Down
16 changes: 0 additions & 16 deletions cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/VectorStore.kt

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,14 +1,31 @@
package cc.unitmesh.rag.store;

import cc.unitmesh.docs.SampleCode
import cc.unitmesh.nlp.embedding.Embedding
import cc.unitmesh.nlp.embedding.toEmbedding
import cc.unitmesh.rag.document.Document
import org.assertj.core.api.Assertions
import org.junit.jupiter.api.Test

class InMemoryEmbeddingStoreTest {
/**
 * Sample usage of [InMemoryEmbeddingStore]: adds two embeddings with attached [Document]s and
 * queries the store for the 2 entries most relevant to a reference embedding.
 *
 * NOTE(review): the `start-sample` / `end-sample` comments presumably delimit the snippet that the
 * docs tooling extracts for `@SampleCode` — confirm before adding or editing comments between them.
 */
@Test
@SampleCode
fun it_works() {
// start-sample
val embeddingStore: EmbeddingStore<Document> = InMemoryEmbeddingStore()

embeddingStore.add(toEmbedding(floatArrayOf(1f, 3f)), Document.from("first"))
embeddingStore.add(toEmbedding(floatArrayOf(2f, 2f)), Document.from("second"))

val relevant: List<EmbeddingMatch<Document>> =
embeddingStore.findRelevant(toEmbedding(floatArrayOf(4f, 0f)), 2)

// end-sample
// NOTE(review): `relevant` is never asserted on, so this test only checks that the calls above
// do not throw.

}

@Test
@SampleCode(name = "文本嵌入示例", content = "")
fun should_add_embedding_with_generated_id() {
val embeddingStore: EmbeddingStore<Document> = InMemoryEmbeddingStore()

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package cc.unitmesh.rag.store;

import cc.unitmesh.docs.SampleCode
import cc.unitmesh.nlp.embedding.text.EnglishTextEmbeddingProvider
import org.junit.jupiter.api.Assertions.*
import org.junit.jupiter.api.Test

class InMemoryEnglishTextStoreTest {
private val provider = EnglishTextEmbeddingProvider()
@Test
@SampleCode
fun should_find_relevant_embeddings() {
// given
// start-sample
val store = InMemoryEnglishTextStore<String>()

store.add(provider.embed("this is a example"), "this is a example")
Expand All @@ -19,6 +21,7 @@ class InMemoryEnglishTextStoreTest {
val minScore = 0.5

val embedding4 = provider.embed("this is a cat")
// end-sample

// when
val relevantEmbeddings = store.findRelevant(embedding4, maxResults, minScore)
Expand Down
8 changes: 5 additions & 3 deletions docs-builder/src/main/kotlin/cc/unitmesh/docs/KDocGen.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ class KDocGen(private val rootDir: Path) : DocGenerator() {
private var fileNodes = listOf<FileASTNode>()

override fun execute(): List<RootDocContent> {
fileNodes = fileNodes + processor.process(rootDir)
fileNodes += processor.process(rootDir)
return extractNodes(fileNodes)
}

fun appendNodes(dir: Path) {
fileNodes = fileNodes + processor.process(dir)
fun appendNodes(vararg dirs: Path) {
dirs.forEach {
fileNodes += processor.process(it)
}
}

fun extractNodes(fileASTNodes: List<FileASTNode>): List<RootDocContent> {
Expand Down
30 changes: 30 additions & 0 deletions docs-builder/src/main/kotlin/cc/unitmesh/docs/Runner.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Runner : CliktCommand() {
processRagScript(rootDir)
processPromptScript(rootDir)
processDocumentModule(rootDir)
processVectorStoreModule(rootDir)
}

private val warningLog =
Expand Down Expand Up @@ -89,6 +90,35 @@ class Runner : CliktCommand() {
}
}

/**
 * Generates the "Vector Store" documentation page under `docs/rag/`.
 *
 * Builds a [KDocGen] rooted at the core `rag/store` sources, appends the store test directory and the
 * three `rag-modules/store-*` directories, renders the extracted docs and writes them, prefixed with
 * Jekyll front matter and the shared warning banner, to `vector-store.md`.
 *
 * @param rootDir directory against which every source and output path is resolved.
 */
private fun processVectorStoreModule(rootDir: Path) {
val documentDir = rootDir.resolve("cocoa-core/src/main/kotlin/cc/unitmesh/rag/store/")
val kDocGen = KDocGen(documentDir)

// Also process the store tests and the external store modules.
kDocGen.appendNodes(
rootDir.resolve("cocoa-core/src/test/kotlin/cc/unitmesh/rag/store/"),
rootDir.resolve("rag-modules/store-elasticsearch"),
rootDir.resolve("rag-modules/store-milvus"),
rootDir.resolve("rag-modules/store-pinecone")
)

val documentDocs = kDocGen.execute()

val docs = renderDocs(documentDocs)
val outputDir = rootDir.resolve("docs/rag/")
var index = 11
docs.forEach { (name, content) ->
// The front matter is identical for every entry apart from `index`.
var output =
CustomJekyllFrontMatter("Vector Store", "Retrieval Augmented Generation", index, "/rag/vector-store")
.toMarkdown()

output = "$output$warningLog"

// NOTE(review): every iteration writes to the same `vector-store.md`, and `writeText` replaces the
// file's contents, so if `docs` has more than one entry only the last one survives. `name` is
// unused. Confirm whether a single combined page or one file per entry is intended.
val outputFile = outputDir.resolve("vector-store.md")
outputFile.toFile().writeText(output + "\n\n" + content)
index += 1
}
}


private fun renderDocs(rootDocContents: List<RootDocContent>): Map<String, String> {
return rootDocContents.associate { treeDoc ->
Expand Down
3 changes: 1 addition & 2 deletions docs/rag-script/workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,7 @@ indexing {

## querying

Querying is a function block for querying data for the workflow. You don't need to call it as block.

Querying is a function block for querying data for the workflow. You don't need to call it as a block.
For example:
```kotlin
querying {
Expand Down
17 changes: 11 additions & 6 deletions docs/rag/document.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,23 @@ Automatically generated documentation; use the command `./gradlew :docs-builder:

# DocumentParser

> > 当前的 Chocolate Factory 主要基于 [Langchain4j](https://github.com/langchain4j/langchain4j) 的实现。
> The DocumentParser interface is responsible for parsing different types of documents.
Parse the given input stream and return a list of documents.
This interface is mainly based on the implementation of [Langchain4j](https://github.com/langchain4j/langchain4j).

返回多个 [Document]:
The DocumentParser interface returns multiple [Document]s for:
- [cc.unitmesh.rag.document.DocumentType.PPT]

返回单个 [Document]:
The DocumentParser interface returns a single [Document] for:
- [cc.unitmesh.rag.document.DocumentType.PDF]
- [cc.unitmesh.rag.document.DocumentType.TXT]
- [cc.unitmesh.rag.document.DocumentType.HTML]
- [cc.unitmesh.rag.document.DocumentType.DOC]
- [cc.unitmesh.rag.document.DocumentType.XLS]
- [cc.unitmesh.rag.document.DocumentType.MD]
- [cc.unitmesh.rag.document.DocumentType.HTML]
- [cc.unitmesh.rag.document.DocumentType.TXT]

The DocumentParser interface provides a method to parse the input stream of a document.



## MdDocumentParser
Expand Down
Loading

0 comments on commit 18a5eac

Please sign in to comment.