
Commit

chore: mypy lint
jamescalam committed Nov 28, 2024
1 parent da7b26c commit 6eb71c1
Showing 12 changed files with 102 additions and 40 deletions.
4 changes: 2 additions & 2 deletions semantic_router/encoders/__init__.py
@@ -15,7 +15,7 @@
from semantic_router.encoders.tfidf import TfidfEncoder
from semantic_router.encoders.vit import VitEncoder
from semantic_router.encoders.zure import AzureOpenAIEncoder
from semantic_router.schema import EncoderType
from semantic_router.schema import EncoderType, SparseEmbedding

__all__ = [
"AurelioSparseEncoder",
@@ -79,5 +79,5 @@ def __init__(self, type: str, name: Optional[str]):
else:
raise ValueError(f"Encoder type '{type}' not supported")

def __call__(self, texts: List[str]) -> List[List[float]]:
def __call__(self, texts: List[str]) -> List[List[float]] | List[SparseEmbedding]:
return self.model(texts)
10 changes: 3 additions & 7 deletions semantic_router/encoders/bm25.py
@@ -53,12 +53,8 @@ def __call__(self, docs: List[str]) -> list[SparseEmbedding]:
else:
raise ValueError("No documents to encode.")

embeds = [[0.0] * len(self.idx_mapping)] * len(docs)
embeds = []
for i, output in enumerate(sparse_dicts):
indices = output["indices"]
values = output["values"]
for idx, val in zip(indices, values):
if idx in self.idx_mapping:
position = self.idx_mapping[idx]
embeds[i][position] = val
if isinstance(output, dict):
embeds.append(SparseEmbedding.from_pinecone_dict(output))
return embeds
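
For reference, a minimal sketch of what the reworked loop above now produces, assuming this commit's SparseEmbedding helpers (from_pinecone_dict, to_dict) and a Pinecone-style sparse dict with "indices" and "values" keys:

from semantic_router.schema import SparseEmbedding

# One per-document sparse dict as returned by the BM25 model
sparse_dict = {"indices": [3, 17, 42], "values": [0.5, 1.2, 0.7]}
embed = SparseEmbedding.from_pinecone_dict(sparse_dict)
# to_dict() maps token index to weight, roughly {3: 0.5, 17: 1.2, 42: 0.7}
print(embed.to_dict())
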
4 changes: 3 additions & 1 deletion semantic_router/index/base.py
@@ -4,7 +4,7 @@
import numpy as np
from pydantic.v1 import BaseModel

from semantic_router.schema import ConfigParameter, Utterance
from semantic_router.schema import ConfigParameter, SparseEmbedding, Utterance
from semantic_router.route import Route
from semantic_router.utils.logger import logger

@@ -108,6 +108,7 @@ def query(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""
Search the index for the query_vector and return top_k results.
@@ -120,6 +121,7 @@ async def aquery(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""
Search the index for the query_vector and return top_k results.
18 changes: 12 additions & 6 deletions semantic_router/index/hybrid_local.py
@@ -3,7 +3,7 @@
import numpy as np
from numpy.linalg import norm

from semantic_router.schema import ConfigParameter, Utterance
from semantic_router.schema import ConfigParameter, SparseEmbedding, Utterance
from semantic_router.index.local import LocalIndex
from semantic_router.utils.logger import logger
from typing import Any
@@ -76,6 +76,8 @@ def _sparse_dot_product(
return sum(vec_a[i] * vec_b.get(i, 0) for i in vec_a)

def _sparse_index_dot_product(self, vec_a: dict[int, float]) -> list[float]:
if self.sparse_index is None:
raise ValueError("self.sparse_index is not populated.")
dot_products = [
self._sparse_dot_product(vec_a, vec_b) for vec_b in self.sparse_index
]
@@ -86,7 +88,7 @@ def query(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: Optional[dict[int, float]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""Search the index for the query and return top_k results.
@@ -103,9 +105,13 @@ def query(
raise ValueError("Route filter is not supported for HybridLocalIndex.")

xq_d = vector.copy()
if sparse_vector is None:
raise ValueError("Sparse vector is required for HybridLocalIndex.")
xq_s = sparse_vector.copy()
# align sparse vector type
if isinstance(sparse_vector, SparseEmbedding):
xq_s = sparse_vector.to_dict()
elif isinstance(sparse_vector, dict):
xq_s = sparse_vector
else:
raise ValueError("Sparse vector must be a SparseEmbedding or dict.")

if self.index is not None and self.sparse_index is not None:
# calculate dense vec similarity
@@ -130,7 +136,7 @@ async def aquery(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: Optional[dict[int, float]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""Search the index for the query and return top_k results. This method calls the
sync `query` method as everything uses numpy computations which is CPU-bound
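
As a standalone illustration of the sparse dot product that _sparse_index_dot_product guards above (pure Python, no library imports; the helper name here is illustrative only):

def sparse_dot(vec_a: dict[int, float], vec_b: dict[int, float]) -> float:
    # Multiply only the indices present in vec_a; missing indices count as 0.0
    return sum(vec_a[i] * vec_b.get(i, 0.0) for i in vec_a)

print(sparse_dot({1: 0.5, 4: 2.0}, {4: 1.5, 9: 3.0}))  # 2.0 * 1.5 = 3.0
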
4 changes: 3 additions & 1 deletion semantic_router/index/local.py
@@ -2,7 +2,7 @@

import numpy as np

from semantic_router.schema import ConfigParameter, Utterance
from semantic_router.schema import ConfigParameter, SparseEmbedding, Utterance
from semantic_router.index.base import BaseIndex
from semantic_router.linear import similarity_matrix, top_scores
from semantic_router.utils.logger import logger
@@ -68,6 +68,7 @@ def query(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""
Search the index for the query and return top_k results.
@@ -97,6 +98,7 @@ async def aquery(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""
Search the index for the query and return top_k results.
29 changes: 21 additions & 8 deletions semantic_router/index/pinecone.py
@@ -237,19 +237,19 @@ def add(
embeddings: List[List[float]],
routes: List[str],
utterances: List[str],
function_schemas: Optional[List[Dict[str, Any]]] = None,
function_schemas: Optional[Optional[List[Dict[str, Any]]]] = None,
metadata_list: List[Dict[str, Any]] = [],
batch_size: int = 100,
sparse_embeddings: Optional[List[dict[int, float]]] = None,
sparse_embeddings: Optional[Optional[List[dict[int, float]]]] = None,
):
"""Add vectors to Pinecone in batches."""
if self.index is None:
self.dimensions = self.dimensions or len(embeddings[0])
self.index = self._init_index(force_create=True)
if function_schemas is None:
function_schemas = [None] * len(embeddings)
function_schemas = [{}] * len(embeddings)
if sparse_embeddings is None:
sparse_embeddings = [None] * len(embeddings)
sparse_embeddings = [{}] * len(embeddings)

vectors_to_upsert = [
PineconeRecord(
@@ -261,7 +261,7 @@ def add(
metadata=metadata,
).to_dict()
for vector, route, utterance, function_schema, metadata, sparse_dict in zip(
embeddings, routes, utterances, function_schemas, metadata_list, sparse_embeddings # type: ignore
embeddings,
routes,
utterances,
function_schemas,
metadata_list,
sparse_embeddings,
)
]

@@ -449,7 +454,7 @@ async def aquery(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
**kwargs: Any,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""
Asynchronously search the index for the query vector and return the top_k results.
@@ -475,9 +480,17 @@
filter_query = {"sr_route": {"$in": route_filter}}
else:
filter_query = None
# set sparse_vector_obj
sparse_vector_obj: dict[str, Any] | None = None
if sparse_vector is not None:
if isinstance(sparse_vector, dict):
sparse_vector_obj = SparseEmbedding.from_dict(sparse_vector)
if isinstance(sparse_vector, SparseEmbedding):
# unnecessary if-statement but mypy didn't like this otherwise
sparse_vector_obj = sparse_vector.to_pinecone()
results = await self._async_query(
vector=query_vector_list,
sparse_vector=kwargs.get("sparse_vector", None),
sparse_vector=sparse_vector_obj,
namespace=self.namespace or "",
filter=filter_query,
top_k=top_k,
@@ -507,7 +520,7 @@ def delete_index(self):
async def _async_query(
self,
vector: list[float],
sparse_vector: Optional[dict] = None,
sparse_vector: dict[str, Any] | None = None,
namespace: str = "",
filter: Optional[dict] = None,
top_k: int = 5,
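
For context, a minimal sketch of the end-to-end conversion the new sparse_vector handling in aquery is aiming at, assuming this commit's SparseEmbedding.from_dict and to_pinecone helpers:

from semantic_router.schema import SparseEmbedding

sparse_vector = {3: 0.5, 17: 1.2}  # dict[int, float] as passed by callers
payload = SparseEmbedding.from_dict(sparse_vector).to_pinecone()
# payload is roughly {"indices": [3, 17], "values": [0.5, 1.2]}, the shape
# Pinecone's sparse_vector query parameter expects
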
3 changes: 2 additions & 1 deletion semantic_router/index/postgres.py
@@ -8,7 +8,7 @@
from pydantic import BaseModel

from semantic_router.index.base import BaseIndex
from semantic_router.schema import ConfigParameter, Metric
from semantic_router.schema import ConfigParameter, Metric, SparseEmbedding
from semantic_router.utils.logger import logger


@@ -340,6 +340,7 @@ def query(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
"""
Searches the index for the query vector and returns the top_k results.
4 changes: 3 additions & 1 deletion semantic_router/index/qdrant.py
@@ -4,7 +4,7 @@
from pydantic.v1 import Field

from semantic_router.index.base import BaseIndex
from semantic_router.schema import ConfigParameter, Metric, Utterance
from semantic_router.schema import ConfigParameter, Metric, SparseEmbedding, Utterance
from semantic_router.utils.logger import logger

DEFAULT_COLLECTION_NAME = "semantic-router-index"
@@ -259,6 +259,7 @@ def query(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
from qdrant_client import QdrantClient, models

@@ -292,6 +293,7 @@ async def aquery(
vector: np.ndarray,
top_k: int = 5,
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> Tuple[np.ndarray, List[str]]:
from qdrant_client import AsyncQdrantClient, models

31 changes: 24 additions & 7 deletions semantic_router/routers/base.py
@@ -1027,6 +1027,15 @@ def _set_aggregation_method(self, aggregation: str = "sum"):
)

def _semantic_classify(self, query_results: List[Dict]) -> Tuple[str, List[float]]:
"""Classify the query results into a single class based on the highest total score.
If no classification is found, return an empty string and an empty list.
:param query_results: The query results to classify. Expected format is a list of
dictionaries with "route" and "score" keys.
:type query_results: List[Dict]
:return: A tuple containing the top class and its associated scores.
:rtype: Tuple[str, List[float]]
"""
scores_by_class = self.group_scores_by_class(query_results)

if self.aggregation_method is None:
@@ -1049,6 +1058,15 @@ def _semantic_classify(self, query_results: List[Dict]) -> Tuple[str, List[float
async def _async_semantic_classify(
self, query_results: List[Dict]
) -> Tuple[str, List[float]]:
"""Classify the query results into a single class based on the highest total score.
If no classification is found, return an empty string and an empty list.
:param query_results: The query results to classify. Expected format is a list of
dictionaries with "route" and "score" keys.
:type query_results: List[Dict]
:return: A tuple containing the top class and its associated scores.
:rtype: Tuple[str, List[float]]
"""
scores_by_class = await self.async_group_scores_by_class(query_results)

if self.aggregation_method is None:
@@ -1125,8 +1143,8 @@ async def async_group_scores_by_class(
return scores_by_class

def _pass_threshold(self, scores: List[float], threshold: float | None) -> bool:
"""Test if the route score passes the minimum threshold. If a threshold of None is
set, then the route will always pass no matter how low it scores.
"""Test if the route score passes the minimum threshold. A threshold of None defaults
to 0.0, so the route will always pass no matter how low it scores.
:param scores: The scores to test.
:type scores: List[float]
@@ -1168,9 +1186,9 @@ def set_threshold(self, threshold: float, route_name: str | None = None):
for route in self.routes:
route.score_threshold = threshold
else:
route = self.get(route_name)
if route is not None:
route.score_threshold = threshold
route_get: Route | None = self.get(route_name)
if route_get is not None:
route_get.score_threshold = threshold
else:
logger.error(f"Route `{route_name}` not found")

@@ -1190,9 +1208,8 @@ def to_yaml(self, file_path: str):
config.to_file(file_path)

def get_thresholds(self) -> Dict[str, float]:
# TODO: float() below is hacky fix for lint, fix this with new type?
thresholds = {
route.name: float(route.score_threshold or self.score_threshold)
route.name: route.score_threshold or self.score_threshold or 0.0
for route in self.routes
}
return thresholds
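
A quick illustration of the or-chaining now used in get_thresholds: None falls through to the next value in the chain, and so does an explicit 0.0, since both are falsy (the helper below is illustrative only, not part of the codebase):

def resolve_threshold(route_threshold: float | None, router_threshold: float | None) -> float:
    # Mirrors `route.score_threshold or self.score_threshold or 0.0`
    return route_threshold or router_threshold or 0.0

print(resolve_threshold(None, 0.3))  # 0.3
print(resolve_threshold(0.0, 0.3))   # 0.3 (a falsy 0.0 also falls through)
print(resolve_threshold(0.5, 0.3))   # 0.5
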
20 changes: 15 additions & 5 deletions semantic_router/routers/hybrid.py
@@ -124,24 +124,34 @@ def __call__(
route_filter: Optional[List[str]] = None,
sparse_vector: dict[int, float] | SparseEmbedding | None = None,
) -> RouteChoice:
vector_arr: np.ndarray | None = None
potential_sparse_vector: List[SparseEmbedding] | None = None
# if no vector provided, encode text to get vector
if vector is None:
if text is None:
raise ValueError("Either text or vector must be provided")
vector, potential_sparse_vector = self._encode(text=[text])
vector_arr, potential_sparse_vector = self._encode(text=[text])
if sparse_vector is None:
if text is None:
raise ValueError("Either text or sparse_vector must be provided")
sparse_vector = potential_sparse_vector
sparse_vector = (
potential_sparse_vector[0] if potential_sparse_vector else None
)
if sparse_vector is None:
raise ValueError("Sparse vector is required for HybridLocalIndex.")
vector_arr = vector_arr if vector_arr else np.array(vector)
# TODO: add alpha as a parameter
scores, route_names = self.index.query(
vector=np.array(vector) if isinstance(vector, list) else vector,
vector=vector_arr,
top_k=self.top_k,
route_filter=route_filter,
sparse_vector=sparse_vector[0],
sparse_vector=sparse_vector,
)
top_class, top_class_scores = self._semantic_classify(
list(zip(scores, route_names))
[
{"score": score, "route": route}
for score, route in zip(scores, route_names)
]
)
passed = self._pass_threshold(top_class_scores, self.score_threshold)
if passed:
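
For reference, a standalone illustration of the query-result shape the rewritten _semantic_classify call above now builds (a list of {"score": ..., "route": ...} dicts) and of sum aggregation over it; this mirrors the documented format, not the library's exact internals:

from collections import defaultdict

query_results = [
    {"score": 0.7, "route": "politics"},
    {"score": 0.2, "route": "chitchat"},
    {"score": 0.4, "route": "politics"},
]
scores_by_class: dict[str, list[float]] = defaultdict(list)
for result in query_results:
    scores_by_class[result["route"]].append(result["score"])
top_class = max(scores_by_class, key=lambda route: sum(scores_by_class[route]))
print(top_class, scores_by_class[top_class])  # politics [0.7, 0.4]
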
5 changes: 5 additions & 0 deletions semantic_router/schema.py
@@ -444,6 +444,11 @@ def from_dict(cls, sparse_dict: dict):
arr = np.array([list(sparse_dict.keys()), list(sparse_dict.values())]).T
return cls.from_compact_array(arr)

@classmethod
def from_pinecone_dict(cls, sparse_dict: dict):
arr = np.array([sparse_dict["indices"], sparse_dict["values"]]).T
return cls.from_compact_array(arr)

def to_dict(self):
return {
i: v for i, v in zip(self.embedding[:, 0].astype(int), self.embedding[:, 1])
10 changes: 9 additions & 1 deletion tests/unit/test_hybrid_layer.py
@@ -75,7 +75,15 @@ def routes():


sparse_encoder = BM25Encoder(use_default_params=False)
sparse_encoder.fit(["The quick brown fox", "jumps over the lazy dog", "Hello, world!"])
sparse_encoder.fit(
[
Route(
name="Route 1",
utterances=["The quick brown fox", "jumps over the lazy dog"],
),
Route(name="Route 2", utterances=["Hello, world!"]),
]
)


class TestHybridRouter:
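
The updated test fits the sparse encoder on Route objects rather than raw strings; a minimal sketch of that call shape, assuming this commit's BM25Encoder.fit accepts routes as shown in the test above:

from semantic_router import Route
from semantic_router.encoders import BM25Encoder

routes = [
    Route(
        name="Route 1",
        utterances=["The quick brown fox", "jumps over the lazy dog"],
    ),
    Route(name="Route 2", utterances=["Hello, world!"]),
]
encoder = BM25Encoder(use_default_params=False)
encoder.fit(routes)
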
