From 64a640955ecb3507415ee39b1444e692a8771a3a Mon Sep 17 00:00:00 2001 From: Aakash Thatte <84656834+sky-2002@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:22:09 +0530 Subject: [PATCH] Voyager Backend (#41) * Add voyager backend --- Makefile | 2 +- README.md | 7 +- pyproject.toml | 4 +- tests/conftest.py | 36 ++++++++-- vicinity/backends/__init__.py | 5 ++ vicinity/backends/voyager.py | 125 ++++++++++++++++++++++++++++++++++ vicinity/datatypes.py | 1 + 7 files changed, 171 insertions(+), 9 deletions(-) create mode 100644 vicinity/backends/voyager.py diff --git a/Makefile b/Makefile index 8f31896..05fa564 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ install: venv uv run pre-commit install install-no-pre-commit: - uv pip install ".[dev,hnsw,pynndescent,annoy,faiss,usearch]" + uv pip install ".[dev,hnsw,pynndescent,annoy,faiss,usearch,voyager]" install-base: uv sync --extra dev diff --git a/README.md b/README.md index f43b962..3110368 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ The following backends are supported: - `ivf_scalar`: Inverted file search with scalar quantizer. - `ivfpq`: Inverted file search with product quantizer. - `ivfpqr`: Inverted file search with product quantizer and refinement. - +- [VOYAGER](https://github.com/spotify/voyager): Voyager is a library for performing fast approximate nearest-neighbor searches on an in-memory collection of vectors. @@ -149,7 +149,9 @@ NOTE: the ANN backends do not support dynamic deletion. To delete items, you nee | | `connectivity` | Number of connections per node in the graph. | `16` | | | `expansion_add` | Number of candidates considered during graph construction. | `128` | | | `expansion_search` | Number of candidates considered during search. | `64` | - +| **VOYAGER** | `metric` | Similarity space to use (`cosine`, `l2`). | `"cosine"` | +| | `ef_construction` | The number of vectors that this index searches through when inserting a new vector into the index. 
| `200` | +| | `m` | The number of connections between nodes in the tree’s internal data structure. | `16` | ## Installation The following installation options are available: @@ -166,6 +168,7 @@ pip install vicinity[faiss] pip install vicinity[hnsw] pip install vicinity[pynndescent] pip install vicinity[usearch] +pip install vicinity[voyager] ``` ## License diff --git a/pyproject.toml b/pyproject.toml index 7fc0eb5..5473082 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ pynndescent = [ annoy = ["annoy"] faiss = ["faiss-cpu"] usearch = ["usearch"] +voyager = ["voyager"] all = [ "hnswlib", "pynndescent>=0.5.10", @@ -60,7 +61,8 @@ all = [ "numpy>=1.24.0", "annoy", "faiss-cpu", - "usearch" + "usearch", + "voyager" ] [project.urls] diff --git a/tests/conftest.py b/tests/conftest.py index e2bde56..e727015 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,17 @@ random_gen = np.random.default_rng(42) -_faiss_index_types = ["flat", "ivf", "hnsw", "lsh", "scalar", "pq", "ivf_scalar", "ivfpq", "ivfpqr"] +_faiss_index_types = [ + "flat", + "ivf", + "hnsw", + "lsh", + "scalar", + "pq", + "ivf_scalar", + "ivfpq", + "ivfpqr", +] @pytest.fixture(scope="session") @@ -35,11 +45,15 @@ def query_vector() -> np.ndarray: (Backend.ANNOY, None), (Backend.PYNNDESCENT, None), (Backend.USEARCH, None), + (Backend.VOYAGER, None), ] # Create human-readable ids for each backend type -BACKEND_IDS = [f"{backend.name}-{index_type}" if index_type else backend.name for backend, index_type in BACKEND_PARAMS] +BACKEND_IDS = [ + f"{backend.name}-{index_type}" if index_type else backend.name + for backend, index_type in BACKEND_PARAMS +] @pytest.fixture(params=BACKEND_PARAMS) @@ -49,7 +63,9 @@ def backend_type(request: pytest.FixtureRequest) -> Backend: @pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS) -def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity: +def vicinity_instance( + request: 
pytest.FixtureRequest, items: list[str], vectors: np.ndarray +) -> Vicinity: """Fixture providing a Vicinity instance for each backend type.""" backend_type, index_type = request.param # Handle FAISS backend with specific FAISS index types @@ -57,11 +73,21 @@ def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: if index_type in ("pq", "ivfpq", "ivfpqr"): # Use smaller values for pq indexes since the dataset is small return Vicinity.from_vectors_and_items( - vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4 + vectors, + items, + backend_type=backend_type, + index_type=index_type, + m=2, + nbits=4, ) else: return Vicinity.from_vectors_and_items( - vectors, items, backend_type=backend_type, index_type=index_type, nlist=2, nbits=32 + vectors, + items, + backend_type=backend_type, + index_type=index_type, + nlist=2, + nbits=32, ) return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type) diff --git a/vicinity/backends/__init__.py b/vicinity/backends/__init__.py index 2fe414b..5402e73 100644 --- a/vicinity/backends/__init__.py +++ b/vicinity/backends/__init__.py @@ -33,5 +33,10 @@ def get_backend_class(backend: Union[Backend, str]) -> type[AbstractBackend]: return UsearchBackend + elif backend == Backend.VOYAGER: + from vicinity.backends.voyager import VoyagerBackend + + return VoyagerBackend + __all__ = ["get_backend_class", "AbstractBackend"] diff --git a/vicinity/backends/voyager.py b/vicinity/backends/voyager.py new file mode 100644 index 0000000..3c73bfa --- /dev/null +++ b/vicinity/backends/voyager.py @@ -0,0 +1,125 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Union + +import numpy as np +from numpy import typing as npt +from voyager import Index, Space + +from vicinity.backends.base import AbstractBackend, BaseArgs +from vicinity.datatypes import Backend, QueryResult +from vicinity.utils import Metric, 
normalize
+
+
+@dataclass
+class VoyagerArgs(BaseArgs):
+    """Settings of a Voyager index, stored next to it as ``arguments.json``."""
+
+    # Dimensionality of the indexed vectors (``vectors.shape[1]`` at build time).
+    dim: int = 0
+    # Voyager space name as written by ``from_vectors``: "cosine" or "l2".
+    metric: str = "cosine"
+    # Number of vectors the index searches through when inserting a new vector.
+    ef_construction: int = 200
+    # Number of connections between nodes in the index's internal structure.
+    m: int = 16
+
+
+class VoyagerBackend(AbstractBackend[VoyagerArgs]):
+    """Nearest-neighbor backend that delegates to a ``voyager.Index``."""
+
+    argument_class = VoyagerArgs
+    supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN}
+    # Metric enum -> metric name used by this backend and stored in VoyagerArgs.
+    inverse_metric_mapping = {
+        Metric.COSINE: "cosine",
+        Metric.EUCLIDEAN: "l2",
+    }
+
+    # Metric name -> integer handed to ``voyager.Space(value=...)``.
+    # NOTE(review): 0 / 2 presumably are Space.Euclidean / Space.Cosine; confirm
+    # against the Voyager ``Space`` enum.
+    metric_int_mapping = {
+        "l2": 0,
+        "cosine": 2,
+    }
+
+    def __init__(
+        self,
+        index: Index,
+        arguments: VoyagerArgs,
+    ) -> None:
+        """Wrap an already built Voyager index.
+
+        :param index: The Voyager index that holds the vectors.
+        :param arguments: The arguments describing how the index was built.
+        """
+        super().__init__(arguments)
+        self.index = index
+
+    @classmethod
+    def from_vectors(
+        cls: type[VoyagerBackend],
+        vectors: npt.NDArray,
+        metric: Union[str, Metric],
+        ef_construction: int,
+        m: int,
+        **kwargs: Any,
+    ) -> VoyagerBackend:
+        """Build a new index from vectors and wrap it in a backend.
+
+        :param vectors: The vectors to index; ``vectors.shape[1]`` is used as the
+            dimensionality, so a 2-D array is expected.
+        :param metric: The metric to use, as a ``Metric`` or its string form.
+        :param ef_construction: Forwarded to ``Index`` as ``ef_construction``.
+        :param m: Forwarded to ``Index`` as ``M``.
+        :param kwargs: Accepted for interface compatibility and ignored.
+        :return: A backend holding the freshly populated index.
+        :raises ValueError: If the metric is not in ``supported_metrics``.
+        """
+        metric_enum = Metric.from_string(metric)
+
+        if metric_enum not in cls.supported_metrics:
+            raise ValueError(
+                f"Metric '{metric_enum.value}' is not supported by VoyagerBackend."
+            )
+
+        # Use a separate name instead of rebinding the ``metric`` parameter to a
+        # value of a different kind.
+        metric_name = cls._map_metric_to_string(metric_enum)
+        dim = vectors.shape[1]
+        space = Space(value=cls.metric_int_mapping[metric_name])
+        index = Index(
+            space=space,
+            num_dimensions=dim,
+            M=m,
+            ef_construction=ef_construction,
+        )
+        index.add_items(vectors)
+        arguments = VoyagerArgs(
+            dim=dim,
+            metric=metric_name,
+            ef_construction=ef_construction,
+            m=m,
+        )
+        return cls(index, arguments)
+
+    def query(self, query: npt.NDArray, k: int) -> QueryResult:
+        """Query the backend for the nearest neighbors.
+
+        :param query: The query vectors.
+            NOTE(review): assumed to be 2-D (one row per query) so that the
+            returned ids and distances can be paired row by row; confirm.
+        :param k: The number of neighbors to retrieve; capped at the number of
+            vectors currently stored.
+        :return: One ``(indices, distances)`` pair per query.
+        """
+        # Clamp k to the index size: asking Voyager for more neighbors than it
+        # stores raises an error instead of returning fewer results.
+        k = min(k, len(self))
+        indices, distances = self.index.query(query, k)
+        return list(zip(indices, distances))
+
+    @classmethod
+    def load(cls: type[VoyagerBackend], base_path: Path) -> VoyagerBackend:
+        """Load the index and its arguments from a directory.
+
+        :param base_path: Directory containing ``index.bin`` and
+            ``arguments.json``.
+        :return: The restored backend.
+        """
+        # Normalize once so both files resolve even when a plain string is
+        # passed; previously only the index path was wrapped in ``Path``.
+        base_path = Path(base_path)
+        arguments = VoyagerArgs.load(base_path / "arguments.json")
+        index = Index.load(str(base_path / "index.bin"))
+        return cls(index, arguments=arguments)
+
+    def save(self, base_path: Path) -> None:
+        """Save the index and its arguments to a directory.
+
+        :param base_path: Directory that receives ``index.bin`` and
+            ``arguments.json``.
+        """
+        # Same normalization as in ``load`` so both writes accept a string path.
+        base_path = Path(base_path)
+        self.index.save(str(base_path / "index.bin"))
+        self.arguments.dump(base_path / "arguments.json")
+
+    def insert(self, vectors: npt.NDArray) -> None:
+        """Insert vectors into the backend.
+
+        :param vectors: The vectors to add to the index.
+        """
+        self.index.add_items(vectors)
+
+    def delete(self, indices: list[int]) -> None:
+        """Delete vectors from the backend.
+
+        :param indices: The indices to delete (unused).
+        :raises NotImplementedError: Always; this backend does not delete.
+        """
+        raise NotImplementedError("Deletion is not supported in Voyager backend.")
+
+    def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]:
+        """Return, per query vector, the indices closer than a threshold.
+
+        :param vectors: The query vectors.
+        :param threshold: Exclusive upper bound on the distance.
+        :return: One array of matching indices per query.
+        """
+        # Every stored vector is a candidate, so request all of them and filter
+        # on distance afterwards.
+        return [
+            indices[distances < threshold]
+            for indices, distances in self.query(vectors, len(self))
+        ]
+
+    @property
+    def backend_type(self) -> Backend:
+        """The type of the backend."""
+        return Backend.VOYAGER
+
+    @property
+    def dim(self) -> int:
+        """Get the dimension of the space, as reported by the index."""
+        return self.index.num_dimensions
+
+    def __len__(self) -> int:
+        """Get the number of vectors, as reported by the index."""
+        return self.index.num_elements
diff --git a/vicinity/datatypes.py b/vicinity/datatypes.py index
81200fe..3f98b54 100644 --- a/vicinity/datatypes.py +++ b/vicinity/datatypes.py @@ -22,3 +22,4 @@ class Backend(str, Enum): PYNNDESCENT = "pynndescent" FAISS = "faiss" USEARCH = "usearch" + VOYAGER = "voyager"