feat: improve late chunking context size

lsorber committed Nov 25, 2024
1 parent 003967b commit c0e4abc
Showing 7 changed files with 19 additions and 15 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -45,8 +45,8 @@ pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_
 Next, it is optional but recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with:
 
 ```sh
-# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.2.88 is supported right now):
-LLAMA_CPP_PYTHON_VERSION=0.2.88
+# Configure which llama-cpp-python precompiled binary to install (⚠️ only v0.3.2 is supported right now):
+LLAMA_CPP_PYTHON_VERSION=0.3.2
 PYTHON_VERSION=310
 ACCELERATOR=metal|cu121|cu122|cu123|cu124
 PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64
@@ -114,7 +114,7 @@ my_config = RAGLiteConfig(
 my_config = RAGLiteConfig(
     db_url="sqlite:///raglite.sqlite",
     llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192",
-    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf",
+    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096",
 )
 ```
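The `@<n_ctx>` suffix on a llama-cpp-python model string sets that model's context size in tokens: the LLM above runs with an 8192-token context, and the embedder now gets an explicit 4096-token context so that late chunking can pool each chunk's embedding from a larger window of surrounding text. As a hedged sketch of how this knob could be turned further (this variant is hypothetical, not part of the diff), BGE-M3 itself accepts inputs of up to 8192 tokens:

```python
from raglite import RAGLiteConfig

# Hypothetical variant of the config above: raise the embedder's context
# window from 4096 tokens to BGE-M3's 8192-token maximum. A larger window
# gives late chunking more surrounding context per chunk, at the cost of
# more memory per embedding batch.
my_config = RAGLiteConfig(
    db_url="sqlite:///raglite.sqlite",
    llm="llama-cpp-python/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/*Q4_K_M.gguf@8192",
    embedder="llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@8192",
)
```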

@@ -232,7 +232,7 @@ You can specify the database URL, LLM, and embedder directly in the Chainlit fro
 raglite chainlit \
     --db_url sqlite:///raglite.sqlite \
     --llm llama-cpp-python/bartowski/Llama-3.2-3B-Instruct-GGUF/*Q4_K_M.gguf@4096 \
-    --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf
+    --embedder llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096
 ```
 
 To use an API-based LLM, make sure to include your credentials in a `.env` file or supply them inline:
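The diff is truncated here, but a hypothetical invocation along the following lines illustrates the point; `gpt-4o-mini` stands in for whichever LiteLLM-routed LLM you use, and `text-embedding-3-small` for an API-based embedder:

```sh
# Hypothetical example: supply the key inline (or via a .env file) and pass
# LiteLLM model names instead of llama-cpp-python model strings.
OPENAI_API_KEY=sk-... raglite chainlit \
    --db_url sqlite:///raglite.sqlite \
    --llm gpt-4o-mini \
    --embedder text-embedding-3-small
```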
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -33,7 +33,7 @@ spacy = ">=3.7.0,<3.8.0"
 # Large Language Models:
 huggingface-hub = ">=0.22.0"
 litellm = ">=1.47.1"
-llama-cpp-python = ">=0.2.88"
+llama-cpp-python = ">=0.3.2"
 pydantic = ">=2.7.0"
 # Approximate Nearest Neighbors:
 pynndescent = ">=0.5.12"
4 changes: 2 additions & 2 deletions src/raglite/_config.py
@@ -33,9 +33,9 @@ class RAGLiteConfig:
     # Embedder config used for indexing.
     embedder: str = field(
         default_factory=lambda: (  # Nomic-embed may be better if only English is used.
-            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf"
+            "llama-cpp-python/lm-kit/bge-m3-gguf/*F16.gguf@4096"
             if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 4  # noqa: PLR2004
-            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf"
+            else "llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@4096"
         )
     )
     embedder_normalize: bool = True
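Both default embedder variants now pin a 4096-token context via the `@4096` suffix. Elsewhere, RAGLite turns this suffix into the `n_ctx` that llama-cpp-python receives (an already-parsed `n_ctx` is in scope in `_litellm.py` below). A minimal sketch of such parsing, where `split_model_string` is a hypothetical helper rather than RAGLite's actual implementation:

```python
def split_model_string(model: str, default_n_ctx: int = 0) -> tuple[str, int]:
    """Split 'lm-kit/bge-m3-gguf/*F16.gguf@4096' into a model id and n_ctx.

    Hypothetical helper for illustration. A missing suffix yields
    default_n_ctx; in llama-cpp-python, n_ctx=0 means "use the context size
    the model was trained with".
    """
    model_id, _, suffix = model.rpartition("@")
    if model_id and suffix.isdigit():
        return model_id, int(suffix)
    return model, default_n_ctx


assert split_model_string("lm-kit/bge-m3-gguf/*F16.gguf@4096") == (
    "lm-kit/bge-m3-gguf/*F16.gguf",
    4096,
)
```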
4 changes: 4 additions & 0 deletions src/raglite/_litellm.py
@@ -104,6 +104,10 @@ def llm(model: str, **kwargs: Any) -> Llama:
         n_ctx=n_ctx,
         n_gpu_layers=-1,
         verbose=False,
+        # Workaround to enable long context embedding models [1].
+        # [1] https://github.com/abetlen/llama-cpp-python/issues/1762
+        n_batch=n_ctx if n_ctx > 0 else 2048,
+        n_ubatch=n_ctx if n_ctx > 0 else 2048,
         **kwargs,
     )
     # Enable caching.
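The four added lines work around a llama.cpp constraint: an embedding model with non-causal attention must fit its entire input into a single micro-batch, so unless `n_batch` and `n_ubatch` are raised to `n_ctx`, inputs longer than the default batch size of 512 tokens fail to embed (the linked issue #1762 has the details). A standalone sketch of the same workaround, written directly against llama-cpp-python and assuming the `lm-kit/bge-m3-gguf` weights used elsewhere in this commit:

```python
from llama_cpp import Llama

n_ctx = 4096  # Desired embedding context size, matching the "@4096" suffix.
embedder = Llama.from_pretrained(
    repo_id="lm-kit/bge-m3-gguf",
    filename="*F16.gguf",
    embedding=True,
    n_ctx=n_ctx,
    # Raise the logical and physical batch sizes to n_ctx so that a full
    # context window can be embedded in one micro-batch. n_ctx == 0 means
    # "use the model's trained context size", hence the fixed fallback.
    n_batch=n_ctx if n_ctx > 0 else 2048,
    n_ubatch=n_ctx if n_ctx > 0 else 2048,
    verbose=False,
)
embedding = embedder.embed("A passage of up to 4096 tokens can now be embedded in one go.")
```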
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -69,7 +69,7 @@ def database(request: pytest.FixtureRequest) -> str:
scope="session",
params=[
pytest.param(
"llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf",
"llama-cpp-python/lm-kit/bge-m3-gguf/*Q4_K_M.gguf@4096",
id="bge_m3",
),
pytest.param(
6 changes: 3 additions & 3 deletions tests/test_rerank.py
@@ -40,16 +40,16 @@ def test_reranker(
     )
     # Search for a query.
     query = "What does it mean for two events to be simultaneous?"
-    chunk_ids, _ = hybrid_search(query, num_results=3, config=raglite_test_config)
+    chunk_ids, _ = hybrid_search(query, num_results=10, config=raglite_test_config)
     # Retrieve the chunks.
     chunks = retrieve_chunks(chunk_ids, config=raglite_test_config)
     assert all(isinstance(chunk, Chunk) for chunk in chunks)
     assert all(chunk_id == chunk.id for chunk_id, chunk in zip(chunk_ids, chunks, strict=True))
     # Rerank the chunks given an inverted chunk order.
     reranked_chunks = rerank_chunks(query, chunks[::-1], config=raglite_test_config)
     if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
-        assert reranked_chunks[0] == chunks[0]
+        assert reranked_chunks[0] in chunks[:3]
     # Test that we can also rerank given the chunk_ids only.
     reranked_chunks = rerank_chunks(query, chunk_ids[::-1], config=raglite_test_config)
     if reranker is not None and "text-embedding-3-small" not in raglite_test_config.embedder:
-        assert reranked_chunks[0] == chunks[0]
+        assert reranked_chunks[0] in chunks[:3]
