Skip to content

Commit

Permalink
feat: Add annoy (#18)
Browse files Browse the repository at this point in the history
* Add annoy

* Fix test

* add dependency

* Add extras to makefile

* Add annoy backend, integrate with pynndescent

* remove stuff

* Fix generic type in pynndescent
  • Loading branch information
stephantul authored Nov 14, 2024
1 parent aaca428 commit 88cb61e
Show file tree
Hide file tree
Showing 11 changed files with 174 additions and 34 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ install: venv
uv run pre-commit install

install-no-pre-commit:
uv pip install ".[dev,hnsw,pynndescent]"
uv pip install ".[dev,hnsw,pynndescent,annoy]"

install-base:
uv sync --extra dev
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pynndescent = [
"llvmlite>=0.42.0",
"numpy>=1.24.0"
]
annoy = ["annoy"]

[project.urls]
"Homepage" = "https://github.com/MinishLab"
Expand Down
21 changes: 9 additions & 12 deletions tests/test_vicinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def test_vicinity_init(backend_type: Backend, items: list[str], vectors: np.ndar
"""
Test Vicinity.init.
:param backend_type: The backend type to use (BASIC or HNSW).
:param backend_type: The backend type to use (BASIC, HNSW or Annoy).
:param items: A list of item names.
:param vectors: An array of vectors.
"""
Expand All @@ -32,7 +32,7 @@ def test_vicinity_from_vectors_and_items(backend_type: Backend, items: list[str]
"""
Test Vicinity.from_vectors_and_items.
:param backend_type: The backend type to use (BASIC or HNSW).
:param backend_type: The backend type to use (BASIC, HNSW or Annoy).
:param items: A list of item names.
:param vectors: An array of vectors.
"""
Expand Down Expand Up @@ -67,31 +67,28 @@ def test_vicinity_query_threshold(vicinity_instance: Vicinity, query_vector: np.
assert len(results) >= 1


def test_vicinity_insert(backend_type: Backend, vicinity_instance: Vicinity, query_vector: np.ndarray) -> None:
def test_vicinity_insert(vicinity_instance: Vicinity, query_vector: np.ndarray) -> None:
"""
Test Vicinity.insert method.
:param backend_type: The backend type to use.
:param vicinity_instance: A Vicinity instance.
:param query_vector: A query vector.
"""
if backend_type == Backend.HNSW or backend_type == Backend.PYNNDESCENT:
# Don't test insert for HNSW backend and PyNNDescent backend
if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT}:
# Don't test insert for the HNSW, Annoy or PyNNDescent backends.
return
new_item = ["item101"]
new_vector = query_vector

vicinity_instance.insert(new_item, new_vector[None, :])

results = vicinity_instance.query(query_vector, k=1)
results = vicinity_instance.query(query_vector, k=10)
returned_item = results[0][0][0]

assert returned_item == "item101"


def test_vicinity_delete(
backend_type: Backend, vicinity_instance: Vicinity, items: list[str], vectors: np.ndarray
) -> None:
def test_vicinity_delete(vicinity_instance: Vicinity, items: list[str], vectors: np.ndarray) -> None:
"""
Test Vicinity.delete method by verifying that the vector for a deleted item is not returned in subsequent queries.
Expand All @@ -100,8 +97,8 @@ def test_vicinity_delete(
:param items: List of item names.
:param vectors: Array of vectors corresponding to items.
"""
if backend_type == Backend.PYNNDESCENT:
# Don't test delete for PyNNDescent backend
if vicinity_instance.backend.backend_type in {Backend.ANNOY, Backend.PYNNDESCENT}:
# Don't test delete for Annoy and Pynndescent backend
return

# Get the vector corresponding to "item2"
Expand Down
13 changes: 13 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions vicinity/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ def get_backend_class(backend: Backend | str) -> type[AbstractBackend]:
from vicinity.backends.hnsw import HNSWBackend

return HNSWBackend
elif backend == Backend.ANNOY:
from vicinity.backends.annoy import AnnoyBackend

return AnnoyBackend
elif backend == Backend.PYNNDESCENT:
from vicinity.backends.pynndescent import PyNNDescentBackend

Expand Down
128 changes: 128 additions & 0 deletions vicinity/backends/annoy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Literal

import numpy as np
from annoy import AnnoyIndex
from numpy import typing as npt

from vicinity.backends.base import AbstractBackend, BaseArgs
from vicinity.datatypes import Backend, QueryResult
from vicinity.utils import normalize


@dataclass
class AnnoyArgs(BaseArgs):
    """Arguments describing an Annoy index, kept alongside the index by AnnoyBackend."""

    # Dimensionality of the indexed vectors; from_vectors sets it to vectors.shape[1].
    dim: int = 0
    # User-facing metric. "cosine" is served by a "dot" index over normalized vectors.
    metric: Literal["dot", "euclidean", "cosine"] = "cosine"
    # Number of trees passed to AnnoyIndex.build.
    trees: int = 100
    # Number of indexed vectors. AnnoyBackend.__init__ rejects None, and save()
    # refreshes this value before dumping the arguments.
    length: int | None = None


class AnnoyBackend(AbstractBackend[AnnoyArgs]):
    """Nearest-neighbour backend that wraps a built ``AnnoyIndex``.

    The index is read-only once built: ``insert`` and ``delete`` raise
    ``NotImplementedError``. The "cosine" metric is implemented as an Annoy
    "dot" index over normalized vectors.
    """

    argument_class = AnnoyArgs

    def __init__(
        self,
        index: AnnoyIndex,
        arguments: AnnoyArgs,
    ) -> None:
        """
        Initialize the backend from an existing index.

        :param index: The Annoy index to query.
        :param arguments: The arguments describing the index; ``length`` must be set.
        :raises ValueError: If ``arguments.length`` is None.
        """
        super().__init__(arguments)
        self.index = index
        if arguments.length is None:
            raise ValueError("Length must be provided.")
        self.length = arguments.length

    @staticmethod
    def _index_metric(metric: Literal["dot", "euclidean", "cosine"]) -> Literal["dot", "euclidean"]:
        """Map the user-facing metric onto the metric name handed to AnnoyIndex."""
        # Cosine is emulated with a dot-product index over normalized vectors.
        return "dot" if metric == "cosine" else metric

    @classmethod
    def from_vectors(
        cls: type[AnnoyBackend],
        vectors: npt.NDArray,
        metric: Literal["dot", "euclidean", "cosine"],
        trees: int,
        **kwargs: Any,
    ) -> AnnoyBackend:
        """
        Create a new instance from vectors.

        :param vectors: A 2D array with one row per item; the row position becomes the item id.
        :param metric: The metric to use. For "cosine" the vectors are normalized first.
        :param trees: The number of trees to build.
        :param **kwargs: Ignored; accepted so all backends can share one call signature.
        :return: A backend wrapping the freshly built index.
        """
        dim = vectors.shape[1]
        if metric == "cosine":
            vectors = normalize(vectors)

        index = AnnoyIndex(f=dim, metric=cls._index_metric(metric))
        for i, vector in enumerate(vectors):
            index.add_item(i, vector)
        index.build(trees)

        arguments = AnnoyArgs(dim=dim, trees=trees, metric=metric, length=len(vectors))
        # Use cls rather than the concrete class so subclasses get their own type back.
        return cls(index, arguments=arguments)

    @property
    def backend_type(self) -> Backend:
        """The type of the backend."""
        return Backend.ANNOY

    @property
    def dim(self) -> int:
        """Get the dimension of the space."""
        return self.index.f

    def __len__(self) -> int:
        """Get the number of vectors."""
        return self.length

    @classmethod
    def load(cls: type[AnnoyBackend], base_path: Path) -> AnnoyBackend:
        """
        Load the index and its arguments from a folder.

        :param base_path: Folder containing "index.bin" and "arguments.json".
        :return: A backend wrapping the loaded index.
        """
        # Coerce once so both files are resolved the same way, also for str input.
        folder = Path(base_path)
        arguments = AnnoyArgs.load(folder / "arguments.json")

        index = AnnoyIndex(arguments.dim, cls._index_metric(arguments.metric))
        index.load(str(folder / "index.bin"))

        return cls(index, arguments=arguments)

    def save(self, base_path: Path) -> None:
        """
        Save the index and its arguments to a folder.

        :param base_path: Folder that receives "index.bin" and "arguments.json".
        """
        folder = Path(base_path)
        self.index.save(str(folder / "index.bin"))
        # NOTE: set the length before saving.
        self.arguments.length = len(self)
        self.arguments.dump(folder / "arguments.json")

    def query(self, vectors: npt.NDArray, k: int) -> QueryResult:
        """
        Query the backend.

        :param vectors: The query vectors, one per row.
        :param k: The number of neighbours to request per query vector.
        :return: One (indices, scores) pair per query vector. For the "cosine"
            metric the scores are returned as 1 - value, i.e. a distance.
        """
        is_cosine = self.arguments.metric == "cosine"
        out = []
        for vec in vectors:
            if is_cosine:
                vec = normalize(vec)
            indices, scores = self.index.get_nns_by_vector(vec, k, include_distances=True)
            scores_array = np.asarray(scores)
            if is_cosine:
                # Turn cosine similarity into cosine distance.
                scores_array = 1 - scores_array
            out.append((np.asarray(indices), scores_array))
        return out

    def insert(self, vectors: npt.NDArray) -> None:
        """Insert vectors into the backend (unsupported; always raises)."""
        raise NotImplementedError("Annoy does not support insertion.")

    def delete(self, indices: list[int]) -> None:
        """Delete vectors from the backend (unsupported; always raises)."""
        raise NotImplementedError("Annoy does not support deletion.")

    def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]:
        """
        Return, per query vector, the indices whose score is below the threshold.

        :param vectors: The query vectors, one per row.
        :param threshold: Exclusive upper bound on the score returned by ``query``.
        :return: One array of indices per query vector.
        """
        # Only the 100 nearest neighbours of each query are considered.
        return [indices[scores < threshold] for indices, scores in self.query(vectors, 100)]
8 changes: 4 additions & 4 deletions vicinity/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, TypeVar
from typing import Any, Generic, TypeVar

from numpy import typing as npt

from vicinity.datatypes import Backend, QueryResult


@dataclass(frozen=True)
@dataclass
class BaseArgs:
def dump(self, file: Path) -> None:
"""Dump the arguments to a file."""
Expand All @@ -32,8 +32,8 @@ def dict(self) -> dict[str, Any]:
ArgType = TypeVar("ArgType", bound=BaseArgs)


class AbstractBackend(ABC):
argument_class: type[BaseArgs]
class AbstractBackend(ABC, Generic[ArgType]):
argument_class: type[ArgType]

def __init__(self, arguments: ArgType, *args: Any, **kwargs: Any) -> None:
"""Initialize the backend with vectors."""
Expand Down
13 changes: 5 additions & 8 deletions vicinity/backends/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
from vicinity.utils import normalize, normalize_or_copy


@dataclass(frozen=True)
class BasicArgs(BaseArgs):
dim: int | None = None
@dataclass
class BasicArgs(BaseArgs): ...


class BasicBackend(AbstractBackend):
class BasicBackend(AbstractBackend[BasicArgs]):
argument_class = BasicArgs

def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None:
Expand All @@ -36,11 +35,9 @@ def backend_type(self) -> Backend:
return Backend.BASIC

@classmethod
def from_vectors(cls: type[BasicBackend], vectors: npt.NDArray, dim: int | None = None) -> BasicBackend:
def from_vectors(cls: type[BasicBackend], vectors: npt.NDArray, **kwargs: Any) -> BasicBackend:
"""Create a new instance from vectors."""
if dim is None:
dim = vectors.shape[1]
return cls(vectors, BasicArgs(dim=dim))
return cls(vectors, BasicArgs())

@classmethod
def load(cls: type[BasicBackend], folder: Path) -> BasicBackend:
Expand Down
13 changes: 6 additions & 7 deletions vicinity/backends/hnsw.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from typing import Any, Literal

from hnswlib import Index as HnswIndex
from numpy import typing as npt
Expand All @@ -11,15 +11,15 @@
from vicinity.datatypes import Backend, QueryResult


@dataclass(frozen=True)
@dataclass
class HNSWArgs(BaseArgs):
dim: int | None = None
dim: int = 0
space: Literal["cosine", "l2"] = "cosine"
ef_construction: int = 200
m: int = 16


class HNSWBackend(AbstractBackend):
class HNSWBackend(AbstractBackend[HNSWArgs]):
argument_class = HNSWArgs

def __init__(
Expand All @@ -35,14 +35,13 @@ def __init__(
def from_vectors(
cls: type[HNSWBackend],
vectors: npt.NDArray,
dim: int | None,
space: Literal["cosine", "l2"],
ef_construction: int,
m: int,
**kwargs: Any,
) -> HNSWBackend:
"""Create a new instance from vectors."""
if dim is None:
dim = vectors.shape[1]
dim = vectors.shape[1]
index = HnswIndex(space=space, dim=dim)
index.init_index(max_elements=vectors.shape[0], ef_construction=ef_construction, M=m)
index.add_items(vectors)
Expand Down
4 changes: 2 additions & 2 deletions vicinity/backends/pynndescent.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from vicinity.utils import normalize_or_copy


@dataclass(frozen=True)
@dataclass
class PyNNDescentArgs(BaseArgs):
n_neighbors: int = 15
metric: Literal[
Expand All @@ -23,7 +23,7 @@ class PyNNDescentArgs(BaseArgs):
] = "cosine"


class PyNNDescentBackend(AbstractBackend):
class PyNNDescentBackend(AbstractBackend[PyNNDescentArgs]):
argument_class = PyNNDescentArgs

def __init__(
Expand Down
1 change: 1 addition & 0 deletions vicinity/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@
class Backend(str, Enum):
    """String-valued identifiers for the available nearest-neighbour backends."""

    HNSW = "hnsw"
    BASIC = "basic"
    ANNOY = "annoy"
    PYNNDESCENT = "pynndescent"

0 comments on commit 88cb61e

Please sign in to comment.