diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 089f2793..5add0784 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -1,7 +1,7 @@ import numpy as np -from numpy.linalg import norm from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder +from semantic_router.linear import similarity_matrix, top_scores from semantic_router.schema import Decision @@ -63,13 +63,9 @@ def _query(self, text: str, top_k: int = 5): xq = np.squeeze(xq) # Reduce to 1d array. if self.index is not None: - index_norm = norm(self.index, axis=1) - xq_norm = norm(xq.T) - sim = np.dot(self.index, xq.T) / (index_norm * xq_norm) - # get indices of top_k records - top_k = min(top_k, sim.shape[0]) - idx = np.argpartition(sim, -top_k)[-top_k:] - scores = sim[idx] + # calculate similarity matrix + sim = similarity_matrix(xq, self.index) + scores, idx = top_scores(sim, top_k) # get the utterance categories (decision names) decisions = self.categories[idx] if self.categories is not None else [] return [ diff --git a/semantic_router/linear.py b/semantic_router/linear.py new file mode 100644 index 00000000..1c13262f --- /dev/null +++ b/semantic_router/linear.py @@ -0,0 +1,30 @@ +from typing import Tuple + +import numpy as np +from numpy.linalg import norm + + +def similarity_matrix(xq: np.ndarray, index: np.ndarray) -> np.ndarray: + """Compute the similarity scores between a query vector and a set of vectors. + + Args: + xq: A query vector (1d ndarray) + index: A set of vectors. + + Returns: + The similarity between the query vector and the set of vectors. 
+ """ + + index_norm = norm(index, axis=1) + xq_norm = norm(xq.T) + sim = np.dot(index, xq.T) / (index_norm * xq_norm) + return sim + + +def top_scores(sim: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]: + # get indices of top_k records + top_k = min(top_k, sim.shape[0]) + idx = np.argpartition(sim, -top_k)[-top_k:] + scores = sim[idx] + + return scores, idx diff --git a/tests/functional/test_linear.py b/tests/functional/test_linear.py new file mode 100644 index 00000000..210de6d2 --- /dev/null +++ b/tests/functional/test_linear.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +from semantic_router.linear import similarity_matrix, top_scores + + +@pytest.fixture +def ident_vector(): + return np.identity(10)[0] + + +@pytest.fixture +def test_index(): + return np.array([[3, 0, 0], [2, 1, 0], [0, 1, 0]]) + + +def test_similarity_matrix__dimensionality(): + """Test that the similarity output is a 1-D array with one score per index row.""" + xq = np.random.random((10,)) # 10-dimensional embedding vector + index = np.random.random((100, 10)) + S = similarity_matrix(xq, index) + assert S.shape == (100,) + + +def test_similarity_matrix__is_norm_max(ident_vector): + """ + Using identical vectors should yield a maximum similarity of 1 + """ + index = np.repeat(np.atleast_2d(ident_vector), 3, axis=0) + sim = similarity_matrix(ident_vector, index) + assert sim.max() == 1.0 + + +def test_similarity_matrix__is_norm_min(ident_vector): + """ + Using orthogonal vectors should yield a minimum similarity of 0 + """ + orth_v = np.roll(np.atleast_2d(ident_vector), 1) + index = np.repeat(orth_v, 3, axis=0) + sim = similarity_matrix(ident_vector, index) + assert sim.min() == 0.0 + + +def test_top_scores__is_sorted(test_index): + """ + Test that the top_scores function returns a sorted list of scores. 
+ """ + + xq = test_index[0] # should have max similarity + + sim = similarity_matrix(xq, test_index) + _, idx = top_scores(sim, 3) + + # Scores and indexes should be sorted ascending + assert np.array_equal(idx, np.array([2, 1, 0])) + + +def test_top_scores__scores(test_index): + """ + Test that for a known vector and a known index, the top_scores function + returns exactly the expected scores. + """ + xq = test_index[0] # should have max similarity + + sim = similarity_matrix(xq, test_index) + scores, _ = top_scores(sim, 3) + + # Scores and indexes should be sorted ascending + assert np.allclose(scores, np.array([0.0, 0.89442719, 1.0])) diff --git a/tests/encoders/test_base.py b/tests/unit/encoders/test_base.py similarity index 100% rename from tests/encoders/test_base.py rename to tests/unit/encoders/test_base.py diff --git a/tests/encoders/test_cohere.py b/tests/unit/encoders/test_cohere.py similarity index 100% rename from tests/encoders/test_cohere.py rename to tests/unit/encoders/test_cohere.py diff --git a/tests/encoders/test_openai.py b/tests/unit/encoders/test_openai.py similarity index 100% rename from tests/encoders/test_openai.py rename to tests/unit/encoders/test_openai.py diff --git a/tests/test_layer.py b/tests/unit/test_layer.py similarity index 100% rename from tests/test_layer.py rename to tests/unit/test_layer.py diff --git a/tests/test_schema.py b/tests/unit/test_schema.py similarity index 100% rename from tests/test_schema.py rename to tests/unit/test_schema.py