From c0e4abc4e9ce7e14b8c9609fdbb50bcdd9399624 Mon Sep 17 00:00:00 2001
From: Laurent Sorber <laurent.sorber@gmail.com>
Date: Mon, 25 Nov 2024 13:59:21 +0100
Subject: [PATCH] feat: improve late chunking context size

---
 README.md               | 8 ++++----
 poetry.lock             | 8 ++++----
 pyproject.toml          | 2 +-
 src/raglite/_config.py  | 4 ++--
 src/raglite/_litellm.py | 4 ++++
 tests/conftest.py       | 2 +-
 tests/test_rerank.py    | 6 +++---
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 2cf9e4c..fc41b1c 100644
--- a/README.md
+++ b/README.md
@@ -45,8 +45,8 @@ pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_
 Next, it is optional but recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with:
 
 ```sh
-# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.2.88 is supported right now):
-LLAMA_CPP_PYTHON_VERSION=0.2.88
+# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.3.2 is supported right now):
+LLAMA_CPP_PYTHON_VERSION=0.3.2
 PYTHON_VERSION=310
 ACCELERATOR=metal|cu121|cu122|cu123|cu124
 PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64
@@ -114,7 +114,7 @@ my_config = RAGLiteConfig(
 my_config = RAGLiteConfig(
     db_url="sqlite:///raglite.sqlite",
     llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192",
-    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf",
+    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096",
 )
 ```
 
@@ -232,7 +232,7 @@ You can specify the database URL, LLM, and embedder directly in the Chainlit fro
 raglite chainlit \
     --db_url sqlite:///raglite.sqlite \
     --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
-    --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf
+    --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096
 ```
 
 To use an API-based LLM, make sure to include your credentials in a `.env` file or supply them inline:
diff --git a/poetry.lock b/poetry.lock
index 4cbc267..5626d81 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2591,12 +2591,12 @@ pydantic = ">=1,<3"
 
 [[package]]
 name = "llama-cpp-python"
-version = "0.2.88"
+version = "0.3.2"
 description = "Python bindings for the llama.cpp library"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "llama_cpp_python-0.2.88.tar.gz", hash = "sha256:b031181d069aa61b3bbec415037b1f060d6d5b36951815f438285c4c85ca693e"},
+    {file = "llama_cpp_python-0.3.2.tar.gz", hash = "sha256:8fbf246a55a999f45015ed0d48f91b4ae04ae959827fac1cd6ac6ec65aed2e2f"},
 ]
 
 [package.dependencies]
@@ -2609,7 +2609,7 @@ typing-extensions = ">=4.5.0"
 all = ["llama_cpp_python[dev,server,test]"]
 dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"]
 server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"]
-test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"]
+test = ["fastapi (>=0.100.0)", "httpx (>=0.24.1)", "huggingface-hub (>=0.23.0)", "pydantic-settings (>=2.0.1)", "pytest (>=7.4.0)", "scipy (>=1.10)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)"]
 
 [[package]]
 name = "llvmlite"
@@ -6813,4 +6813,4 @@ ragas = ["ragas"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "bd140a94bea25c626ad990faf55886d0ca589a9524125a0e7d1aa607e9aa1609"
+content-hash = "ff8a8596ac88ae5406234f08810e2e4d654714af0aa3663451e988a2cf6ef51e"
diff --git a/pyproject.toml b/pyproject.toml
index f933ddb..edd8bb9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ spacy = ">=3.7.0,<3.8.0"
 # Large Language Models:
 huggingface-hub = ">=0.22.0"
 litellm = ">=1.47.1"
-llama-cpp-python = ">=0.2.88"
+llama-cpp-python = ">=0.3.2"
 pydantic = ">=2.7.0"
 # Approximate Nearest Neighbors:
 pynndescent = ">=0.5.12"
diff --git a/src/raglite/_config.py b/src/raglite/_config.py
index 9c73204..8e860b0 100644
--- a/src/raglite/_config.py
+++ b/src/raglite/_config.py
@@ -33,9 +33,9 @@ class RAGLiteConfig:
     # Embedder config used for indexing.
     embedder: str = field(
         default_factory=lambda: (  # Nomic-embed may be better if only English is used.
-            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf"
+            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096"
             if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4  # noqa: PLR2004
-            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf"
+            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@4096"
         )
     )
     embedder_normalize: bool = True
diff --git a/src/raglite/_litellm.py b/src/raglite/_litellm.py
index 6bcc97e..e8b338a 100644
--- a/src/raglite/_litellm.py
+++ b/src/raglite/_litellm.py
@@ -104,6 +104,10 @@ def llm(model: str, **kwargs: Any) -> Llama:
                 n_ctx=n_ctx,
                 n_gpu_layers=-1,
                 verbose=False,
+                # Workaround to enable long context embedding models [1].
+                # [1] https://github.com/abetlen/llama-cpp-python/issues/1762
+                n_batch=n_ctx if n_ctx > 0 else 2048,
+                n_ubatch=n_ctx if n_ctx > 0 else 2048,
                 **kwargs,
             )
         # Enable caching.
diff --git a/tests/conftest.py b/tests/conftest.py
index 256bad8..eecc8c9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -69,7 +69,7 @@ def database(request: pytest.FixtureRequest) -> str:
     scope="session",
     params=[
         pytest.param(
-            "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf",
+            "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@4096",
             id="bge_m3",
         ),
         pytest.param(
diff --git a/tests/test_rerank.py b/tests/test_rerank.py
index 20357c7..9d37e92 100644
--- a/tests/test_rerank.py
+++ b/tests/test_rerank.py
@@ -40,7 +40,7 @@ def test_reranker(
     )
     # Search for a query.
     query = "What does it mean for two events to be simultaneous?"
-    chunk_ids, _ = hybrid_search(query, num_results=3, config=raglite_test_config)
+    chunk_ids, _ = hybrid_search(query, num_results=10, config=raglite_test_config)
     # Retrieve the chunks.
     chunks = retrieve_chunks(chunk_ids, config=raglite_test_config)
     assert all(isinstance(chunk, Chunk) for chunk in chunks)
@@ -48,8 +48,8 @@ def test_reranker(
     # Rerank the chunks given an inverted chunk order.
     reranked_chunks = rerank_chunks(query, chunks[::-1], config=raglite_test_config)
     if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
-        assert reranked_chunks[0] == chunks[0]
+        assert reranked_chunks[0] in chunks[:3]
     # Test that we can also rerank given the chunk_ids only.
     reranked_chunks = rerank_chunks(query, chunk_ids[::-1], config=raglite_test_config)
     if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
-        assert reranked_chunks[0] == chunks[0]
+        assert reranked_chunks[0] in chunks[:3]