From c0e4abc4e9ce7e14b8c9609fdbb50bcdd9399624 Mon Sep 17 00:00:00 2001 From: Laurent Sorber Date: Mon, 25 Nov 2024 13:59:21 +0100 Subject: [PATCH] feat: improve late chunking context size --- README.md | 8 ++++---- poetry.lock | 8 ++++---- pyproject.toml | 2 +- src/raglite/_config.py | 4 ++-- src/raglite/_litellm.py | 4 ++++ tests/conftest.py | 2 +- tests/test_rerank.py | 6 +++--- 7 files changed, 19 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2cf9e4c..fc41b1c 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,8 @@ pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_ Next, it is optional but recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with: ```sh -# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.2.88 is supported right now): -LLAMA_CPP_PYTHON_VERSION=0.2.88 +# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.3.2 is supported right now): +LLAMA_CPP_PYTHON_VERSION=0.3.2 PYTHON_VERSION=310 ACCELERATOR=metal|cu121|cu122|cu123|cu124 PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64 @@ -114,7 +114,7 @@ my_config = RAGLiteConfig( my_config = RAGLiteConfig( db_url="sqlite:///raglite.sqlite", llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192", - embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf", + embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096", ) ``` @@ -232,7 +232,7 @@ You can specify the database URL, LLM, and embedder directly in the Chainlit fro raglite chainlit \ --db_url sqlite:///raglite.sqlite \ --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \ - --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf + --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096 ``` To use an API-based LLM, make sure to include your credentials in a `.env` file or supply them inline: diff --git a/poetry.lock b/poetry.lock index 4cbc267..5626d81 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2591,12 +2591,12 @@ pydantic = ">=1,<3" [[package]] name = "llama-cpp-python" -version = "0.2.88" +version = "0.3.2" description = "Python bindings for the llama.cpp library" optional = false python-versions = ">=3.8" files = [ - {file = "llama_cpp_python-0.2.88.tar.gz", hash = "sha256:b031181d069aa61b3bbec415037b1f060d6d5b36951815f438285c4c85ca693e"}, + {file = "llama_cpp_python-0.3.2.tar.gz", hash = "sha256:8fbf246a55a999f45015ed0d48f91b4ae04ae959827fac1cd6ac6ec65aed2e2f"}, ] [package.dependencies] @@ -2609,7 +2609,7 @@ typing-extensions = ">=4.5.0" all = ["llama_cpp_python[dev,server,test]"] dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] +test = ["fastapi (>=0.100.0)", "httpx (>=0.24.1)", "huggingface-hub (>=0.23.0)", "pydantic-settings (>=2.0.1)", "pytest (>=7.4.0)", "scipy (>=1.10)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)"] [[package]] name = "llvmlite" @@ -6813,4 +6813,4 @@ ragas = ["ragas"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "bd140a94bea25c626ad990faf55886d0ca589a9524125a0e7d1aa607e9aa1609" +content-hash = "ff8a8596ac88ae5406234f08810e2e4d654714af0aa3663451e988a2cf6ef51e" diff --git a/pyproject.toml b/pyproject.toml index f933ddb..edd8bb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ spacy = ">=3.7.0,<3.8.0" # Large Language Models: huggingface-hub = ">=0.22.0" litellm = ">=1.47.1" -llama-cpp-python = ">=0.2.88" +llama-cpp-python = ">=0.3.2" pydantic = ">=2.7.0" # Approximate Nearest Neighbors: pynndescent = ">=0.5.12" diff --git a/src/raglite/_config.py b/src/raglite/_config.py index 9c73204..8e860b0 100644 --- a/src/raglite/_config.py +++ b/src/raglite/_config.py @@ -33,9 +33,9 @@ class RAGLiteConfig: # Embedder config used for indexing. embedder: str = field( default_factory=lambda: ( # Nomic-embed may be better if only English is used. - "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf" + "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096" if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4 # noqa: PLR2004 - else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf" + else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@4096" ) ) embedder_normalize: bool = True diff --git a/src/raglite/_litellm.py b/src/raglite/_litellm.py index 6bcc97e..e8b338a 100644 --- a/src/raglite/_litellm.py +++ b/src/raglite/_litellm.py @@ -104,6 +104,10 @@ def llm(model: str, **kwargs: Any) -> Llama: n_ctx=n_ctx, n_gpu_layers=-1, verbose=False, + # Workaround to enable long context embedding models [1]. + # [1] https://github.com/abetlen/llama-cpp-python/issues/1762 + n_batch=n_ctx if n_ctx > 0 else 2048, + n_ubatch=n_ctx if n_ctx > 0 else 2048, **kwargs, ) # Enable caching. diff --git a/tests/conftest.py b/tests/conftest.py index 256bad8..eecc8c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,7 +69,7 @@ def database(request: pytest.FixtureRequest) -> str: scope="session", params=[ pytest.param( - "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf", + "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@4096", id="bge_m3", ), pytest.param( diff --git a/tests/test_rerank.py b/tests/test_rerank.py index 20357c7..9d37e92 100644 --- a/tests/test_rerank.py +++ b/tests/test_rerank.py @@ -40,7 +40,7 @@ def test_reranker( ) # Search for a query. query = "What does it mean for two events to be simultaneous?" - chunk_ids, _ = hybrid_search(query, num_results=3, config=raglite_test_config) + chunk_ids, _ = hybrid_search(query, num_results=10, config=raglite_test_config) # Retrieve the chunks. chunks = retrieve_chunks(chunk_ids, config=raglite_test_config) assert all(isinstance(chunk, Chunk) for chunk in chunks) @@ -48,8 +48,8 @@ def test_reranker( # Rerank the chunks given an inverted chunk order. reranked_chunks = rerank_chunks(query, chunks[::-1], config=raglite_test_config) if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder: - assert reranked_chunks[0] == chunks[0] + assert reranked_chunks[0] in chunks[:3] # Test that we can also rerank given the chunk_ids only. reranked_chunks = rerank_chunks(query, chunk_ids[::-1], config=raglite_test_config) if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder: - assert reranked_chunks[0] == chunks[0] + assert reranked_chunks[0] in chunks[:3]