From 30a50b4bea30cc5d3252997994a64ea24bb80482 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sun, 12 Nov 2023 07:10:15 -0800 Subject: [PATCH 01/16] restructure and testing different decision layer structures --- semantic_router/encoders/__init__.py | 5 - semantic_router/encoders/huggingface.py | 9 - semantic_router/layer.py | 246 +++++++++++++++++- semantic_router/matchers/__init__.py | 0 semantic_router/matchers/base.py | 18 ++ semantic_router/matchers/ranker_only.py | 1 + semantic_router/matchers/two_stage.py | 59 +++++ semantic_router/rankers/__init__.py | 0 semantic_router/rankers/base.py | 12 + semantic_router/rankers/cohere.py | 31 +++ semantic_router/retrievers/__init__.py | 5 + .../{encoders => retrievers}/base.py | 4 +- semantic_router/retrievers/bm25.py | 21 ++ .../{encoders => retrievers}/cohere.py | 10 +- semantic_router/retrievers/huggingface.py | 9 + .../{encoders => retrievers}/openai.py | 8 +- semantic_router/schema.py | 32 +-- 17 files changed, 420 insertions(+), 50 deletions(-) delete mode 100644 semantic_router/encoders/__init__.py delete mode 100644 semantic_router/encoders/huggingface.py create mode 100644 semantic_router/matchers/__init__.py create mode 100644 semantic_router/matchers/base.py create mode 100644 semantic_router/matchers/ranker_only.py create mode 100644 semantic_router/matchers/two_stage.py create mode 100644 semantic_router/rankers/__init__.py create mode 100644 semantic_router/rankers/base.py create mode 100644 semantic_router/rankers/cohere.py create mode 100644 semantic_router/retrievers/__init__.py rename semantic_router/{encoders => retrievers}/base.py (67%) create mode 100644 semantic_router/retrievers/bm25.py rename semantic_router/{encoders => retrievers}/cohere.py (71%) create mode 100644 semantic_router/retrievers/huggingface.py rename semantic_router/{encoders => retrievers}/openai.py (81%) diff --git a/semantic_router/encoders/__init__.py 
b/semantic_router/encoders/__init__.py deleted file mode 100644 index 3fc94815..00000000 --- a/semantic_router/encoders/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import BaseEncoder -from .cohere import CohereEncoder -from .openai import OpenAIEncoder - -__all__ = ["BaseEncoder", "CohereEncoder", "OpenAIEncoder"] diff --git a/semantic_router/encoders/huggingface.py b/semantic_router/encoders/huggingface.py deleted file mode 100644 index 258c5037..00000000 --- a/semantic_router/encoders/huggingface.py +++ /dev/null @@ -1,9 +0,0 @@ -from semantic_router.encoders import BaseEncoder - - -class HuggingFaceEncoder(BaseEncoder): - def __init__(self, name: str): - self.name = name - - def __call__(self, texts: list[str]) -> list[float]: - raise NotImplementedError diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 089f2793..12b6e80a 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -1,24 +1,124 @@ import numpy as np from numpy.linalg import norm -from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder +from semantic_router.retrievers import ( + BaseRetriever, + CohereRetriever, + OpenAIRetriever, + BM25Retriever +) +from semantic_router.rankers import BaseRanker +from semantic_router.matchers import BaseMatcher from semantic_router.schema import Decision +class MatcherDecisionLayer: + index: None + decision_arr: None + score_threshold: float + + def __init__(self, matcher: BaseMatcher, decisions: list[Decision] = []): + self.matcher = matcher + # if decisions list has been passed and we have retriever + # we initialize index now + if matcher.retriever and decisions: + # initialize index now + for decision in decisions: + self._add_decision(decision=decision) + + def __call__(self, text: str) -> str | None: + raise NotImplementedError + +class RankDecisionLayer: + def __init__(self, ranker: BaseRanker, decisions: list[Decision] = []): + self.ranker = ranker + # if decisions list has been passed, we 
initialize decision array + if decisions: + for decision in decisions: + self._add_decision(decision=decision) + + def __call__(self, text: str) -> str | None: + results = self._query(text) + top_class, top_class_scores = self._semantic_classify(results) + passed = self._pass_threshold(top_class_scores, self.score_threshold) + if passed: + return top_class + else: + return None + + def add(self, decision: Decision): + self._add_decision(decision.utterances) + + def _add_decision(self, decision: Decision): + # create decision categories array + if self.categories is None: + self.categories = np.array([decision.name] * len(decision.utterances)) + self.utterances = np.array(decision.utterances) + else: + str_arr = np.array([decision.name] * len(decision.utterances)) + self.categories = np.concatenate([self.categories, str_arr]) + self.utterances = np.concatenate([ + self.utterances, + np.array(decision.utterances) + ]) + + def _query(self, text: str, top_k: int = 5): + """Given some text, encodes and searches the index vector space to + retrieve the top_k most similar records. 
+ """ + if self.categories: + self.rerank.top_n = top_k + idx, docs = self.ranker(query=text, docs=self.utterances) + # create scores based on rank + scores = [1/(i+1) for i in range(len(docs))] + # get the utterance categories (decision names) + decisions = self.categories[idx] if self.categories is not None else [] + return [ + {"decision": d, "score": s.item()} for d, s in zip(decisions, scores) + ] + else: + return [] + + def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float]]: + scores_by_class = {} + for result in query_results: + score = result["score"] + decision = result["decision"] + if decision in scores_by_class: + scores_by_class[decision].append(score) + else: + scores_by_class[decision] = [score] + + # Calculate total score for each class + total_scores = { + decision: sum(scores) for decision, scores in scores_by_class.items() + } + top_class = max(total_scores, key=lambda x: total_scores[x], default=None) + + # Return the top class and its associated scores + return str(top_class), scores_by_class.get(top_class, []) + + def _pass_threshold(self, scores: list[float], threshold: float) -> bool: + if scores: + return max(scores) > threshold + else: + return False + + class DecisionLayer: index = None categories = None - similarity_threshold = 0.82 + score_threshold = 0.82 - def __init__(self, encoder: BaseEncoder, decisions: list[Decision] = []): + def __init__(self, encoder: BaseRetriever, decisions: list[Decision] = []): self.encoder = encoder # decide on default threshold based on encoder - if isinstance(encoder, OpenAIEncoder): - self.similarity_threshold = 0.82 - elif isinstance(encoder, CohereEncoder): - self.similarity_threshold = 0.3 + if isinstance(encoder, OpenAIRetriever): + self.score_threshold = 0.82 + elif isinstance(encoder, CohereRetriever): + self.score_threshold = 0.3 else: - self.similarity_threshold = 0.82 + self.score_threshold = 0.82 # if decisions list has been passed, we initialize index now if 
decisions: # initialize index now @@ -28,7 +128,7 @@ def __init__(self, encoder: BaseEncoder, decisions: list[Decision] = []): def __call__(self, text: str) -> str | None: results = self._query(text) top_class, top_class_scores = self._semantic_classify(results) - passed = self._pass_threshold(top_class_scores, self.similarity_threshold) + passed = self._pass_threshold(top_class_scores, self.score_threshold) if passed: return top_class else: @@ -102,3 +202,131 @@ def _pass_threshold(self, scores: list[float], threshold: float) -> bool: return max(scores) > threshold else: return False + + +class HybridDecisionLayer: + index = None + categories = None + score_threshold = 0.82 + + def __init__( + self, + encoder: BaseRetriever, + decisions: list[Decision] = [], + alpha: float = 0.3 + ): + self.encoder = encoder + self.sparse_encoder = BM25Retriever() + # decide on default threshold based on encoder + if isinstance(encoder, OpenAIRetriever): + self.score_threshold = 0.82 + elif isinstance(encoder, CohereRetriever): + self.score_threshold = 0.3 + else: + self.score_threshold = 0.82 + # if decisions list has been passed, we initialize index now + if decisions: + # initialize index now + for decision in decisions: + self._add_decision(decision=decision) + + def __call__(self, text: str) -> str | None: + results = self._query(text) + top_class, top_class_scores = self._semantic_classify(results) + passed = self._pass_threshold(top_class_scores, self.score_threshold) + if passed: + return top_class + else: + return None + + def add(self, decision: Decision): + self._add_decision(decision=decision) + + def _add_decision(self, decision: Decision): + # create embeddings + dense_embeds = self.encoder(decision.utterances) + sparse_embeds = self.sparse_encoder(decision.utterances) + # concatenate vectors to create hybrid vecs + embeds = np.concatenate([ + dense_embeds, sparse_embeds + ], axis=1) + + # create decision array + if self.categories is None: + self.categories = 
np.array([decision.name] * len(embeds)) + self.utterances = np.array(decision.utterances) + else: + str_arr = np.array([decision.name] * len(embeds)) + self.categories = np.concatenate([self.categories, str_arr]) + self.utterances = np.concatenate([ + self.utterances, + np.array(decision.utterances) + ]) + # create utterance array (the index) + if self.index is None: + self.index = np.array(embeds) + else: + embed_arr = np.array(embeds) + self.index = np.concatenate([self.index, embed_arr]) + + def _query(self, text: str, top_k: int = 5): + """Given some text, encodes and searches the index vector space to + retrieve the top_k most similar records. + """ + # create dense query vector + xq_d = np.array(self.encoder([text])) + xq_d = np.squeeze(xq_d) # Reduce to 1d array. + # create sparse query vector + xq_s = np.array(self.sparse_encoder([text])) + xq_s = np.squeeze(xq_s) + # convex scaling + xq_d, xq_s = self._convex_scaling(xq_d, xq_s) + # concatenate to create single hybrid vec + xq = np.concatenate([xq_d, xq_s], axis=1) + + if self.index is not None: + index_norm = norm(self.index, axis=1) + xq_norm = norm(xq.T) + sim = np.dot(self.index, xq.T) / (index_norm * xq_norm) + # get indices of top_k records + top_k = min(top_k, sim.shape[0]) + idx = np.argpartition(sim, -top_k)[-top_k:] + scores = sim[idx] + # get the utterance categories (decision names) + decisions = self.categories[idx] if self.categories is not None else [] + return [ + {"decision": d, "score": s.item()} for d, s in zip(decisions, scores) + ] + else: + return [] + + def _convex_scaling(self, dense: list[float], sparse: list[float]): + # scale sparse and dense vecs + dense = dense * self.alpha + sparse = sparse * (1 - self.alpha) + return dense, sparse + + def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float]]: + scores_by_class = {} + for result in query_results: + score = result["score"] + decision = result["decision"] + if decision in scores_by_class: + 
scores_by_class[decision].append(score) + else: + scores_by_class[decision] = [score] + + # Calculate total score for each class + total_scores = { + decision: sum(scores) for decision, scores in scores_by_class.items() + } + top_class = max(total_scores, key=lambda x: total_scores[x], default=None) + + # Return the top class and its associated scores + return str(top_class), scores_by_class.get(top_class, []) + + def _pass_threshold(self, scores: list[float], threshold: float) -> bool: + if scores: + return max(scores) > threshold + else: + return False \ No newline at end of file diff --git a/semantic_router/matchers/__init__.py b/semantic_router/matchers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/semantic_router/matchers/base.py b/semantic_router/matchers/base.py new file mode 100644 index 00000000..fc42cbe8 --- /dev/null +++ b/semantic_router/matchers/base.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +from semantic_router.retrievers import BaseRetriever +from semantic_router.rankers import BaseRanker +from semantic_router.schema import Decision + + +class BaseMatcher(BaseModel): + retriever: BaseRetriever | None + ranker: BaseRanker | None + top_k: int | None + top_n: int | None + + class Config: + arbitrary_types_allowed = True + + def __call__(self, query: str, decisions: list[Decision]) -> str: + raise NotImplementedError("Subclasses must implement this method") \ No newline at end of file diff --git a/semantic_router/matchers/ranker_only.py b/semantic_router/matchers/ranker_only.py new file mode 100644 index 00000000..08b7fe2e --- /dev/null +++ b/semantic_router/matchers/ranker_only.py @@ -0,0 +1 @@ +from semantic_router import rankers \ No newline at end of file diff --git a/semantic_router/matchers/two_stage.py b/semantic_router/matchers/two_stage.py new file mode 100644 index 00000000..6b570030 --- /dev/null +++ b/semantic_router/matchers/two_stage.py @@ -0,0 +1,59 @@ +import numpy as np + +from 
semantic_router.rankers import ( + BaseRanker, + CohereRanker +) +from semantic_router.retrievers import ( + BaseRetriever, + CohereRetriever +) +from semantic_router.matchers import BaseMatcher +from semantic_router.schema import Decision + + +class TwoStageMatcher(BaseMatcher): + def __init__( + self, + retriever: BaseRetriever | None, + ranker: BaseRanker | None, + top_k: int = 25, + top_n: int = 5 + ): + super().__init__( + retriever=retriever, ranker=ranker, top_k=top_k, top_n=top_n + ) + if retriever is None: + retriever = CohereRetriever( + name="embed-english-v3.0", + top_k=top_k + ) + if ranker is None: + ranker = CohereRanker( + name="rerank-english-v2.0", + top_n=top_n + ) + + def __call__(self, query: str, decisions: list[Decision]) -> str: + pass + + def add(self, decision: Decision): + self._add_decision(decision=decision) + + def _add_decision(self, decision: Decision): + # create embeddings for first stage + embeds = self.retriever(decision.utterances) + # create a decision array for decision categories + if self.categories is None: + self.categories = np.array([decision.name] * len(embeds)) + else: + str_arr = np.array([decision.name] * len(embeds)) + self.categories = np.concatenate([self.categories, str_arr]) + # create utterance array (the index) + if self.index is None: + self.index = np.array(embeds) + else: + embed_arr = np.array(embeds) + self.index = np.concatenate([self.index, embed_arr]) + + \ No newline at end of file diff --git a/semantic_router/rankers/__init__.py b/semantic_router/rankers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/semantic_router/rankers/base.py b/semantic_router/rankers/base.py new file mode 100644 index 00000000..5d326f33 --- /dev/null +++ b/semantic_router/rankers/base.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel + + +class BaseRanker(BaseModel): + name: str + top_n: int = 5 + + class Config: + arbitrary_types_allowed = True + + def __call__(self, query: str, docs: list[str]) -> 
list[str]: + raise NotImplementedError("Subclasses must implement this method") diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py new file mode 100644 index 00000000..b703a960 --- /dev/null +++ b/semantic_router/rankers/cohere.py @@ -0,0 +1,31 @@ +import os + +import cohere + +from semantic_router.rankers import BaseReranker + + +class CohereRanker(BaseReranker): + client: cohere.Client | None + + def __init__( + self, name: str = "rerank-english-v2.0", + top_n: int = 5, + cohere_api_key: str | None = None + ): + super().__init__(name=name, top_n=top_n) + cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY") + if cohere_api_key is None: + raise ValueError("Cohere API key cannot be 'None'.") + self.client = cohere.Client(cohere_api_key) + + def __call__(self, query: str, docs: list[str]) -> list[str]: + # get top_n results + results = self.client.rerank( + query=query, documents=docs, top_n=self.top_n, + model=self.name + ) + # get indices of entries that are ranked highest by cohere + top_idx = [r.index for r in results] + top_docs = [docs[i] for i in top_idx] + return top_idx, top_docs \ No newline at end of file diff --git a/semantic_router/retrievers/__init__.py b/semantic_router/retrievers/__init__.py new file mode 100644 index 00000000..0fcaa6d2 --- /dev/null +++ b/semantic_router/retrievers/__init__.py @@ -0,0 +1,5 @@ +from .base import BaseRetriever +from .cohere import CohereRetriever +from .openai import OpenAIRetriever + +__all__ = ["BaseRetriever", "CohereRetriever", "OpenAIRetriever"] diff --git a/semantic_router/encoders/base.py b/semantic_router/retrievers/base.py similarity index 67% rename from semantic_router/encoders/base.py rename to semantic_router/retrievers/base.py index 4b5ca40d..4274e074 100644 --- a/semantic_router/encoders/base.py +++ b/semantic_router/retrievers/base.py @@ -1,11 +1,11 @@ from pydantic import BaseModel -class BaseEncoder(BaseModel): +class BaseRetriever(BaseModel): name: str class 
Config: arbitrary_types_allowed = True - def __call__(self, texts: list[str]) -> list[float]: + def __call__(self, docs: list[str]) -> list[float]: raise NotImplementedError("Subclasses must implement this method") diff --git a/semantic_router/retrievers/bm25.py b/semantic_router/retrievers/bm25.py new file mode 100644 index 00000000..2a68a3ff --- /dev/null +++ b/semantic_router/retrievers/bm25.py @@ -0,0 +1,21 @@ +import os + +from pinecone_text import BM25Encoder + +from semantic_router.retrievers import BaseRetriever + + +class BM25Retriever(BaseRetriever): + def __init__(self, name: str = "bm25"): + super().__init__(name=name) + self.model = BM25Encoder() + + def __call__(self, docs: list[str]) -> list[list[float]]: + if self.params is None: + raise ValueError("BM25 model not trained, must call `.fit` first.") + embeds = self.model.encode_doocuments(docs) + return embeds.embeddings + + def fit(self, docs: list[str]): + params = self.model.fit(docs) + self.model.set_params(**params) \ No newline at end of file diff --git a/semantic_router/encoders/cohere.py b/semantic_router/retrievers/cohere.py similarity index 71% rename from semantic_router/encoders/cohere.py rename to semantic_router/retrievers/cohere.py index 0ed2ecc0..d2334f91 100644 --- a/semantic_router/encoders/cohere.py +++ b/semantic_router/retrievers/cohere.py @@ -2,10 +2,10 @@ import cohere -from semantic_router.encoders import BaseEncoder +from semantic_router.retrievers import BaseRetriever -class CohereEncoder(BaseEncoder): +class CohereRetriever(BaseRetriever): client: cohere.Client | None def __init__( @@ -17,12 +17,12 @@ def __init__( raise ValueError("Cohere API key cannot be 'None'.") self.client = cohere.Client(cohere_api_key) - def __call__(self, texts: list[str]) -> list[list[float]]: + def __call__(self, docs: list[str]) -> list[list[float]]: if self.client is None: raise ValueError("Cohere client is not initialized.") - if len(texts) == 1: + if len(docs) == 1: input_type = 
"search_query" else: input_type = "search_document" - embeds = self.client.embed(texts, input_type=input_type, model=self.name) + embeds = self.client.embed(docs, input_type=input_type, model=self.name) return embeds.embeddings diff --git a/semantic_router/retrievers/huggingface.py b/semantic_router/retrievers/huggingface.py new file mode 100644 index 00000000..9c8f2f05 --- /dev/null +++ b/semantic_router/retrievers/huggingface.py @@ -0,0 +1,9 @@ +from semantic_router.retrievers import BaseRetriever + + +class HuggingFaceRetriever(BaseRetriever): + def __init__(self, name: str): + self.name = name + + def __call__(self, docs: list[str]) -> list[float]: + raise NotImplementedError diff --git a/semantic_router/encoders/openai.py b/semantic_router/retrievers/openai.py similarity index 81% rename from semantic_router/encoders/openai.py rename to semantic_router/retrievers/openai.py index 87feec4c..2dbfd880 100644 --- a/semantic_router/encoders/openai.py +++ b/semantic_router/retrievers/openai.py @@ -4,17 +4,17 @@ import openai from openai.error import RateLimitError -from semantic_router.encoders import BaseEncoder +from semantic_router.retrievers import BaseRetriever -class OpenAIEncoder(BaseEncoder): +class OpenAIRetriever(BaseRetriever): def __init__(self, name: str, openai_api_key: str | None = None): super().__init__(name=name) openai.api_key = openai_api_key or os.getenv("OPENAI_API_KEY") if openai.api_key is None: raise ValueError("OpenAI API key cannot be 'None'.") - def __call__(self, texts: list[str]) -> list[list[float]]: + def __call__(self, docs: list[str]) -> list[list[float]]: """Encode a list of texts using the OpenAI API. Returns a list of vector embeddings. 
""" @@ -22,7 +22,7 @@ def __call__(self, texts: list[str]) -> list[list[float]]: # exponential backoff in case of RateLimitError for j in range(5): try: - res = openai.Embedding.create(input=texts, engine=self.name) + res = openai.Embedding.create(input=docs, engine=self.name) if isinstance(res, dict) and "data" in res: break except RateLimitError: diff --git a/semantic_router/schema.py b/semantic_router/schema.py index 439f2322..ea0ad2cf 100644 --- a/semantic_router/schema.py +++ b/semantic_router/schema.py @@ -3,11 +3,11 @@ from pydantic import BaseModel from pydantic.dataclasses import dataclass -from semantic_router.encoders import ( - BaseEncoder, - CohereEncoder, - HuggingFaceEncoder, - OpenAIEncoder, +from semantic_router.retrievers import ( + BaseRetriever, + CohereRetriever, + HuggingFaceRetriever, + OpenAIRetriever, ) @@ -17,27 +17,27 @@ class Decision(BaseModel): description: str | None = None -class EncoderType(Enum): +class RetrieverType(Enum): HUGGINGFACE = "huggingface" OPENAI = "openai" COHERE = "cohere" @dataclass -class Encoder: - type: EncoderType +class Retriever: + type: RetrieverType name: str - model: BaseEncoder + model: BaseRetriever def __init__(self, type: str, name: str): - self.type = EncoderType(type) + self.type = RetrieverType(type) self.name = name - if self.type == EncoderType.HUGGINGFACE: - self.model = HuggingFaceEncoder(name) - elif self.type == EncoderType.OPENAI: - self.model = OpenAIEncoder(name) - elif self.type == EncoderType.COHERE: - self.model = CohereEncoder(name) + if self.type == RetrieverType.HUGGINGFACE: + self.model = HuggingFaceRetriever(name) + elif self.type == RetrieverType.OPENAI: + self.model = OpenAIRetriever(name) + elif self.type == RetrieverType.COHERE: + self.model = CohereRetriever(name) def __call__(self, texts: list[str]) -> list[float]: return self.model(texts) From 605fb0eea6b4a28e7302524ee31c5331b96ea167 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> 
Date: Sun, 19 Nov 2023 19:03:43 -0800 Subject: [PATCH 02/16] cleanup --- pyproject.toml | 5 +- semantic_router/layer.py | 96 +------------------------ semantic_router/matchers/__init__.py | 0 semantic_router/matchers/base.py | 18 ----- semantic_router/matchers/ranker_only.py | 1 - semantic_router/matchers/two_stage.py | 59 --------------- 6 files changed, 4 insertions(+), 175 deletions(-) delete mode 100644 semantic_router/matchers/__init__.py delete mode 100644 semantic_router/matchers/base.py delete mode 100644 semantic_router/matchers/ranker_only.py delete mode 100644 semantic_router/matchers/two_stage.py diff --git a/pyproject.toml b/pyproject.toml index aa5f664c..b549ed8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,12 @@ [tool.poetry] name = "semantic-router" -version = "0.0.1" +version = "0.0.5" description = "Super fast semantic router for AI decision making" authors = [ "James Briggs ", "Siraj Aizlewood ", - "Simonas Jakubonis " + "Simonas Jakubonis ", + "Luca Mannini " ] readme = "README.md" diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 12b6e80a..ad27a4c1 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -7,104 +7,9 @@ OpenAIRetriever, BM25Retriever ) -from semantic_router.rankers import BaseRanker -from semantic_router.matchers import BaseMatcher from semantic_router.schema import Decision -class MatcherDecisionLayer: - index: None - decision_arr: None - score_threshold: float - - def __init__(self, matcher: BaseMatcher, decisions: list[Decision] = []): - self.matcher = matcher - # if decisions list has been passed and we have retriever - # we initialize index now - if matcher.retriever and decisions: - # initialize index now - for decision in decisions: - self._add_decision(decision=decision) - - def __call__(self, text: str) -> str | None: - raise NotImplementedError - -class RankDecisionLayer: - def __init__(self, ranker: BaseRanker, decisions: list[Decision] = []): - self.ranker = ranker 
- # if decisions list has been passed, we initialize decision array - if decisions: - for decision in decisions: - self._add_decision(decision=decision) - - def __call__(self, text: str) -> str | None: - results = self._query(text) - top_class, top_class_scores = self._semantic_classify(results) - passed = self._pass_threshold(top_class_scores, self.score_threshold) - if passed: - return top_class - else: - return None - - def add(self, decision: Decision): - self._add_decision(decision.utterances) - - def _add_decision(self, decision: Decision): - # create decision categories array - if self.categories is None: - self.categories = np.array([decision.name] * len(decision.utterances)) - self.utterances = np.array(decision.utterances) - else: - str_arr = np.array([decision.name] * len(decision.utterances)) - self.categories = np.concatenate([self.categories, str_arr]) - self.utterances = np.concatenate([ - self.utterances, - np.array(decision.utterances) - ]) - - def _query(self, text: str, top_k: int = 5): - """Given some text, encodes and searches the index vector space to - retrieve the top_k most similar records. 
- """ - if self.categories: - self.rerank.top_n = top_k - idx, docs = self.ranker(query=text, docs=self.utterances) - # create scores based on rank - scores = [1/(i+1) for i in range(len(docs))] - # get the utterance categories (decision names) - decisions = self.categories[idx] if self.categories is not None else [] - return [ - {"decision": d, "score": s.item()} for d, s in zip(decisions, scores) - ] - else: - return [] - - def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float]]: - scores_by_class = {} - for result in query_results: - score = result["score"] - decision = result["decision"] - if decision in scores_by_class: - scores_by_class[decision].append(score) - else: - scores_by_class[decision] = [score] - - # Calculate total score for each class - total_scores = { - decision: sum(scores) for decision, scores in scores_by_class.items() - } - top_class = max(total_scores, key=lambda x: total_scores[x], default=None) - - # Return the top class and its associated scores - return str(top_class), scores_by_class.get(top_class, []) - - def _pass_threshold(self, scores: list[float], threshold: float) -> bool: - if scores: - return max(scores) > threshold - else: - return False - - class DecisionLayer: index = None categories = None @@ -217,6 +122,7 @@ def __init__( ): self.encoder = encoder self.sparse_encoder = BM25Retriever() + self.alpha = alpha # decide on default threshold based on encoder if isinstance(encoder, OpenAIRetriever): self.score_threshold = 0.82 diff --git a/semantic_router/matchers/__init__.py b/semantic_router/matchers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/semantic_router/matchers/base.py b/semantic_router/matchers/base.py deleted file mode 100644 index fc42cbe8..00000000 --- a/semantic_router/matchers/base.py +++ /dev/null @@ -1,18 +0,0 @@ -from pydantic import BaseModel - -from semantic_router.retrievers import BaseRetriever -from semantic_router.rankers import BaseRanker -from 
semantic_router.schema import Decision - - -class BaseMatcher(BaseModel): - retriever: BaseRetriever | None - ranker: BaseRanker | None - top_k: int | None - top_n: int | None - - class Config: - arbitrary_types_allowed = True - - def __call__(self, query: str, decisions: list[Decision]) -> str: - raise NotImplementedError("Subclasses must implement this method") \ No newline at end of file diff --git a/semantic_router/matchers/ranker_only.py b/semantic_router/matchers/ranker_only.py deleted file mode 100644 index 08b7fe2e..00000000 --- a/semantic_router/matchers/ranker_only.py +++ /dev/null @@ -1 +0,0 @@ -from semantic_router import rankers \ No newline at end of file diff --git a/semantic_router/matchers/two_stage.py b/semantic_router/matchers/two_stage.py deleted file mode 100644 index 6b570030..00000000 --- a/semantic_router/matchers/two_stage.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np - -from semantic_router.rankers import ( - BaseRanker, - CohereRanker -) -from semantic_router.retrievers import ( - BaseRetriever, - CohereRetriever -) -from semantic_router.matchers import BaseMatcher -from semantic_router.schema import Decision - - -class TwoStageMatcher(BaseMatcher): - def __init__( - self, - retriever: BaseRetriever | None, - ranker: BaseRanker | None, - top_k: int = 25, - top_n: int = 5 - ): - super().__init__( - retriever=retriever, ranker=ranker, top_k=top_k, top_n=top_n - ) - if retriever is None: - retriever = CohereRetriever( - name="embed-english-v3.0", - top_k=top_k - ) - if ranker is None: - ranker = CohereRanker( - name="rerank-english-v2.0", - top_n=top_n - ) - - def __call__(self, query: str, decisions: list[Decision]) -> str: - pass - - def add(self, decision: Decision): - self._add_decision(decision=decision) - - def _add_decision(self, decision: Decision): - # create embeddings for first stage - embeds = self.retriever(decision.utterances) - # create a decision array for decision categories - if self.categories is None: - 
self.categories = np.array([decision.name] * len(embeds)) - else: - str_arr = np.array([decision.name] * len(embeds)) - self.categories = np.concatenate([self.categories, str_arr]) - # create utterance array (the index) - if self.index is None: - self.index = np.array(embeds) - else: - embed_arr = np.array(embeds) - self.index = np.concatenate([self.index, embed_arr]) - - \ No newline at end of file From 38812965cc9d4bd27fc363591188206df2c89a66 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:45:05 -0800 Subject: [PATCH 03/16] dev work for hybrid --- docs/examples/hybrid-layer.ipynb | 420 ++++++++++++++++++ poetry.lock | 247 +++++++++- pyproject.toml | 1 + semantic_router/__init__.py | 3 + semantic_router/encoders/__init__.py | 6 + .../{retrievers => encoders}/base.py | 2 +- semantic_router/encoders/bm25.py | 40 ++ .../{retrievers => encoders}/cohere.py | 4 +- .../{retrievers => encoders}/huggingface.py | 4 +- .../{retrievers => encoders}/openai.py | 4 +- semantic_router/encoders/tfidf.py | 37 ++ semantic_router/layer.py | 42 +- semantic_router/rankers/cohere.py | 4 +- semantic_router/retrievers/__init__.py | 5 - semantic_router/retrievers/bm25.py | 21 - semantic_router/schema.py | 28 +- 16 files changed, 800 insertions(+), 68 deletions(-) create mode 100644 docs/examples/hybrid-layer.ipynb create mode 100644 semantic_router/encoders/__init__.py rename semantic_router/{retrievers => encoders}/base.py (88%) create mode 100644 semantic_router/encoders/bm25.py rename semantic_router/{retrievers => encoders}/cohere.py (88%) rename semantic_router/{retrievers => encoders}/huggingface.py (61%) rename semantic_router/{retrievers => encoders}/openai.py (92%) create mode 100644 semantic_router/encoders/tfidf.py delete mode 100644 semantic_router/retrievers/__init__.py delete mode 100644 semantic_router/retrievers/bm25.py diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb new 
file mode 100644 index 00000000..589e13a1 --- /dev/null +++ b/docs/examples/hybrid-layer.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Semantic Router: Hybrid Layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Hybrid Layer in the Semantic Router library can improve decision making performance particularly for niche use-cases that contain specific terminology, such as finance or medical. It helps us provide more importance to decision making based on the keywords contained in our utterances and user queries." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Started" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by installing the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU semantic-router==0.0.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by defining a dictionary mapping decisions to example phrases that should trigger those decisions." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"COHERE_API_KEY\"] = \"<>\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jamesbriggs/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } + ], + "source": [ + "from semantic_router.schema import Decision\n", + "\n", + "politics = Decision(\n", + " name=\"politics\",\n", + " utterances=[\n", + " \"isn't politics the best thing ever\",\n", + " \"why don't you tell me about your political opinions\",\n", + " \"don't you just love the president\" \"don't you just hate the president\",\n", + " \"they're going to destroy this country!\",\n", + " \"they will save the country!\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define another for good measure:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "chitchat = Decision(\n", + " name=\"chitchat\",\n", + " utterances=[\n", + " \"how's the weather today?\",\n", + " \"how are things going?\",\n", + " \"lovely weather today\",\n", + " \"the weather is horrendous\",\n", + " \"let's go to the chippy\",\n", + " ],\n", + ")\n", + "\n", + "decisions = [politics, chitchat]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we initialize our embedding model:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_router.encoders import CohereEncoder\n", + "from getpass import getpass\n", + "\n", + "os.environ[\"COHERE_API_KEY\"] = os.environ[\"COHERE_API_KEY\"] or getpass(\n", + " \"Enter Cohere API Key: \"\n", + ")\n", + "\n", + "encoder = CohereEncoder()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we define the `HybridDecisionLayer`. When called, the decision layer consumes text (a query) and outputs the category (`Decision`) it belongs to. To initialize a `HybridDecisionLayer`, we need our `encoder` model and a list of `decisions`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2067848296 1405\n", + "2212344012 2520\n", + "3313717465 206\n", + "3076736765 769\n", + "1778150425 4131\n", + "2067848296 1405\n", + "202708381 770\n", + "2212344012 2520\n", + "3374841595 2375\n", + "2067848296 1405\n", + "3508911095 2067\n", + "3454774732 not in encoder.idx_mapping\n", + "2379717389 3565\n", + "298452803 4356\n", + "1063320047 3369\n", + "4186256544 713\n", + "1846246980 858\n", + "3897916792 643\n", + "575623047 1476\n", + "3897916792 643\n" + ] + }, + { + "ename": "ValueError", + "evalue": "all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 14\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msemantic_router\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlayer\u001b[39;00m \u001b[39mimport\u001b[39;00m HybridDecisionLayer\n\u001b[0;32m----> 3\u001b[0m dl \u001b[39m=\u001b[39m HybridDecisionLayer(encoder\u001b[39m=\u001b[39;49mencoder, decisions\u001b[39m=\u001b[39;49mdecisions)\n", + "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:137\u001b[0m, in \u001b[0;36mHybridDecisionLayer.__init__\u001b[0;34m(self, encoder, decisions, alpha)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[39mif\u001b[39;00m decisions:\n\u001b[1;32m 135\u001b[0m \u001b[39m# initialize index now\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[39mfor\u001b[39;00m decision \u001b[39min\u001b[39;00m 
decisions:\n\u001b[0;32m--> 137\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_add_decision(decision\u001b[39m=\u001b[39;49mdecision)\n", + "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:156\u001b[0m, in \u001b[0;36mHybridDecisionLayer._add_decision\u001b[0;34m(self, decision)\u001b[0m\n\u001b[1;32m 154\u001b[0m sparse_embeds \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_encoder(decision\u001b[39m.\u001b[39mutterances)\n\u001b[1;32m 155\u001b[0m \u001b[39m# concatenate vectors to create hybrid vecs\u001b[39;00m\n\u001b[0;32m--> 156\u001b[0m embeds \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mconcatenate([\n\u001b[1;32m 157\u001b[0m dense_embeds, sparse_embeds\n\u001b[1;32m 158\u001b[0m ], axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m 160\u001b[0m \u001b[39m# create decision array\u001b[39;00m\n\u001b[1;32m 161\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcategories \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", + "\u001b[0;31mValueError\u001b[0m: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)" + ] + } + ], + "source": [ + "from semantic_router.layer import HybridDecisionLayer\n", + "\n", + "dl = HybridDecisionLayer(encoder=encoder, decisions=decisions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"don't you love politics?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if 3454774732 in encoder.idx_mapping:\n", + " print(\"yes\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_router.encoders import BM25Encoder\n", + "\n", + "encoder = BM25Encoder()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tests = [\"hello this is some text\", \"and more stuff\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx_list = encoder.model.get_params()['doc_freq']['indices']\n", + "idx_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sparse_dicts = encoder.model.encode_documents(tests)\n", + "sparse_dicts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeds = [0.0] * len(encoder.idx_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for output in sparse_dicts:\n", + " indices = output[\"indices\"]\n", + " values = output[\"values\"]\n", + " for idx, val in zip(indices, values):\n", + " position = encoder.idx_mapping[idx]\n", + " embeds[position] = val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "encoder.idx_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "encoded_output = encoder(tests)\n", + "encoded_output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "sparse_vec = np.zeros(len(idx_list))\n", + "idx_position_dict = {idx: i for i, idx in enumerate(idx_list)}\n", + "\n", + "for output in encoded_output:\n", + " indices = output['indices']\n", + " values = output['values']\n", + " for idx, value in zip(indices, values):\n", + " if idx in 
idx_position_dict:\n", + " position = idx_position_dict[idx]\n", + " sparse_vec[position] = value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sparse_vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sparse_vec.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can test it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"don't you love politics?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"how's the weather today?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Both are classified accurately. What if we send a query that is unrelated to our existing `Decision` objects?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"I'm interested in learning about llama 2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we return `None` because no matches were identified." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "decision-layer", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index a2617c1c..f47ac40c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "aiohttp" @@ -763,6 +763,17 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] +[[package]] +name = "joblib" +version = "1.3.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, + {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, +] + [[package]] name = "jupyter-client" version = "8.6.0" @@ -819,6 +830,50 @@ files = [ [package.dependencies] traitlets = "*" +[[package]] +name = "mmh3" +version = "3.1.0" +description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions." +optional = false +python-versions = "*" +files = [ + {file = "mmh3-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ee043b1bac040b4324b8baee39df9fdca480a560a6d74f2eef66a5009a234e"}, + {file = "mmh3-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04ac865319e5b36148a4b6cdf27f8bda091c47c4ab7b355d7f353dfc2b8a3cce"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e751f5433417a21c2060b0efa1afc67cfbe29977c867336148c8edb086fae70"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdb863b89c1b34e3681d4a3b15d424734940eb8036f3457cb35ef34fb87a503c"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1230930fbf2faec4ddf5b76d0768ae73c102de173c301962bdd468177275adf9"}, + {file = "mmh3-3.1.0-cp310-cp310-win32.whl", hash = 
"sha256:b8ed7a2361718795a1b519a08d05f44947a20b27e202b53946561a00dde669c1"}, + {file = "mmh3-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:29e878e7467a000f34ab68c218ad7ad81312c0a94bc10df3c50a48bcad39dd83"}, + {file = "mmh3-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c271472325b70d64a4fbb1f2e964ca5b093ac10258e1390f8408890b065868fe"}, + {file = "mmh3-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0109320f7e0e262123ff4f1acd06acfbc8b3bf19cc13d98c0bc369264430aaeb"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:524e29dfe66499695f9496edcfc96782d130aabd6ba12c50c72372163cc6f3ea"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66bdb06a03074e65e614da1aa199b1d16c90608bec9d8fc3faa81d887ffe93cc"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a4d471eb75df8320061ab3b8cbe11c970be9f116b01bc2222ebda9c0a777520"}, + {file = "mmh3-3.1.0-cp311-cp311-win32.whl", hash = "sha256:a886d9ce995a4bdfd7a600ddf61b9015cccbc73c50b898f8ff3c78af24384710"}, + {file = "mmh3-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:5edb5ac882c04aff8a2a18ae8b74a0c339ac9b83db9820d8456f518bb558e0d8"}, + {file = "mmh3-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:190fd10981fbd6c67e10ce3b56bcc021562c0df0fee2e2864347d64e65b1783a"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd781b115cf649811cfde76368c33d2e553b6f88bb41131c314f30d8e65e9d24"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f48bb0a867077acc1f548591ad49506389f36d18f36dccd10becf071e5cbdda4"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7d0936a82438e340636a11b9a938378870fc1c7a139632dac09a9a9277351704"}, + {file = "mmh3-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:d196cc035c2238493248522ae4e54c3cb790549b1564f6dea4d88dfe4b326313"}, + {file = "mmh3-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:731d37f089b6c212fab1beea24e673161146eb6c76baf9ac074a3424d1172d41"}, + {file = "mmh3-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9977fb81f8c66f4eee8439734a18dba7826fe78723d15ab53f42db977005be0f"}, + {file = "mmh3-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bf4f3f20a8b8405c08b13bc9e4ac33bf55129b50b535cd07ce1891b7f96326ac"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87cdbc6e70099ad92f17a28b4054ffb1938657e8fb7c1e4e03b194a1b4683fd6"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6dd81321d14f62aa3711f30533c85a74dc7596e0fee63c8eddd375bc92ab846c"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e6eba88e5c1a2778f3de00a9502e3c214ebb757337ece2a7d71e060d188ddfa"}, + {file = "mmh3-3.1.0-cp38-cp38-win32.whl", hash = "sha256:d91e696925f208d28f3bb7bdf29815524ce955248276af256519bd3538c411ce"}, + {file = "mmh3-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:cbc2917df568aeb86ec5aa863bfb20fa14e01039cbdce7650efbabc30960df49"}, + {file = "mmh3-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b22832d565128be83d69f5d49243bb567840a954df377c9f5b26646a6eec39b"}, + {file = "mmh3-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ced92a0e285a9111413541c197b0c17d280cee96f7c564b258caf5de5ab8ee01"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f906833753b4ddcb690c2c1b74e77725868bc3a8b762b7a77737d08be89ae41d"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash 
= "sha256:72b5685832a7a87a55ebff481794bc410484d7bd4c5e80dae4d8ac50739138ef"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d2aa4d422c7c088bbc5d367b45431268ebe6742a0a64eade93fab708e25757c"}, + {file = "mmh3-3.1.0-cp39-cp39-win32.whl", hash = "sha256:4459bec818f534dc8378568ad89ab310ff47cda3e00ab322edce48dd899bba32"}, + {file = "mmh3-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:03e04b3480e71828f48d17653451a3286555f0534942cb6ba93065b10ad5f9dc"}, + {file = "mmh3-3.1.0.tar.gz", hash = "sha256:9b0f2b2ab4a915333c9d1089572e290a021ebb5b900bb7f7114dccc03995d732"}, +] + [[package]] name = "multidict" version = "6.0.4" @@ -924,6 +979,65 @@ files = [ {file = "nest_asyncio-1.5.8.tar.gz", hash = "sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb"}, ] +[[package]] +name = "nltk" +version = "3.8.1" +description = "Natural Language Toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"}, + {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"}, +] + +[package.dependencies] +click = "*" +joblib = "*" +regex = ">=2021.8.3" +tqdm = "*" + +[package.extras] +all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"] +corenlp = ["requests"] +machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"] +plot = ["matplotlib"] +tgrep = ["pyparsing"] +twitter = ["twython"] + +[[package]] +name = "numpy" +version = "1.25.2" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:db3ccc4e37a6873045580d413fe79b68e47a681af8db2e046f1dacfa11f86eb3"}, + {file = 
"numpy-1.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:90319e4f002795ccfc9050110bbbaa16c944b1c37c0baeea43c5fb881693ae1f"}, + {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfe4a913e29b418d096e696ddd422d8a5d13ffba4ea91f9f60440a3b759b0187"}, + {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f08f2e037bba04e707eebf4bc934f1972a315c883a9e0ebfa8a7756eabf9e357"}, + {file = "numpy-1.25.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bec1e7213c7cb00d67093247f8c4db156fd03075f49876957dca4711306d39c9"}, + {file = "numpy-1.25.2-cp310-cp310-win32.whl", hash = "sha256:7dc869c0c75988e1c693d0e2d5b26034644399dd929bc049db55395b1379e044"}, + {file = "numpy-1.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:834b386f2b8210dca38c71a6e0f4fd6922f7d3fcff935dbe3a570945acb1b545"}, + {file = "numpy-1.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c5462d19336db4560041517dbb7759c21d181a67cb01b36ca109b2ae37d32418"}, + {file = "numpy-1.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5652ea24d33585ea39eb6a6a15dac87a1206a692719ff45d53c5282e66d4a8f"}, + {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d60fbae8e0019865fc4784745814cff1c421df5afee233db6d88ab4f14655a2"}, + {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e7f0f7f6d0eee8364b9a6304c2845b9c491ac706048c7e8cf47b83123b8dbf"}, + {file = "numpy-1.25.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb33d5a1cf360304754913a350edda36d5b8c5331a8237268c48f91253c3a364"}, + {file = "numpy-1.25.2-cp311-cp311-win32.whl", hash = "sha256:5883c06bb92f2e6c8181df7b39971a5fb436288db58b5a1c3967702d4278691d"}, + {file = "numpy-1.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:5c97325a0ba6f9d041feb9390924614b60b99209a71a69c876f71052521d42a4"}, + {file = "numpy-1.25.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:b79e513d7aac42ae918db3ad1341a015488530d0bb2a6abcbdd10a3a829ccfd3"}, + {file = "numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb942bfb6f84df5ce05dbf4b46673ffed0d3da59f13635ea9b926af3deb76926"}, + {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0746410e73384e70d286f93abf2520035250aad8c5714240b0492a7302fdca"}, + {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7806500e4f5bdd04095e849265e55de20d8cc4b661b038957354327f6d9b295"}, + {file = "numpy-1.25.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8b77775f4b7df768967a7c8b3567e309f617dd5e99aeb886fa14dc1a0791141f"}, + {file = "numpy-1.25.2-cp39-cp39-win32.whl", hash = "sha256:2792d23d62ec51e50ce4d4b7d73de8f67a2fd3ea710dcbc8563a51a03fb07b01"}, + {file = "numpy-1.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:76b4115d42a7dfc5d485d358728cdd8719be33cc5ec6ec08632a5d6fca2ed380"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a1329e26f46230bf77b02cc19e900db9b52f398d6722ca853349a782d4cff55"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c3abc71e8b6edba80a01a52e66d83c5d14433cbcd26a40c329ec7ed09f37901"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1b9735c27cea5d995496f46a8b1cd7b408b3f34b6d50459d9ac8fe3a20cc17bf"}, + {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"}, +] + [[package]] name = "openai" version = "0.28.1" @@ -997,6 +1111,28 @@ files = [ [package.dependencies] ptyprocess = ">=0.5" +[[package]] +name = "pinecone-text" +version = "0.7.0" +description = "Text utilities library by Pinecone.io" +optional = false +python-versions = ">=3.8,<4.0" +files = [ + {file = "pinecone_text-0.7.0-py3-none-any.whl", hash = "sha256:d20c7adc2259965a30fcbcf93a5eeb3f8d12babc9ea65ba858f1a6a5973d0737"}, + {file = 
"pinecone_text-0.7.0.tar.gz", hash = "sha256:8bda3c7337511dfb61da541299024ee73dbbed5d94e2af558a12357591b46174"}, +] + +[package.dependencies] +mmh3 = ">=3.1.0,<4.0.0" +nltk = ">=3.6.5,<4.0.0" +numpy = ">=1.21.5,<=1.25.2" +wget = ">=3.2,<4.0" + +[package.extras] +dense = ["openai (>=1.2.3,<2.0.0)", "sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"] +openai = ["openai (>=1.2.3,<2.0.0)"] +splade = ["sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"] + [[package]] name = "platformdirs" version = "4.0.0" @@ -1298,6 +1434,103 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "regex" +version = "2023.10.3" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.7" +files = [ + {file = "regex-2023.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4c34d4f73ea738223a094d8e0ffd6d2c1a1b4c175da34d6b0de3d8d69bee6bcc"}, + {file = "regex-2023.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8f4e49fc3ce020f65411432183e6775f24e02dff617281094ba6ab079ef0915"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cd1bccf99d3ef1ab6ba835308ad85be040e6a11b0977ef7ea8c8005f01a3c29"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:81dce2ddc9f6e8f543d94b05d56e70d03a0774d32f6cca53e978dc01e4fc75b8"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c6b4d23c04831e3ab61717a707a5d763b300213db49ca680edf8bf13ab5d91b"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c15ad0aee158a15e17e0495e1e18741573d04eb6da06d8b84af726cfc1ed02ee"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:6239d4e2e0b52c8bd38c51b760cd870069f0bdf99700a62cd509d7a031749a55"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4a8bf76e3182797c6b1afa5b822d1d5802ff30284abe4599e1247be4fd6b03be"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d9c727bbcf0065cbb20f39d2b4f932f8fa1631c3e01fcedc979bd4f51fe051c5"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3ccf2716add72f80714b9a63899b67fa711b654be3fcdd34fa391d2d274ce767"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:107ac60d1bfdc3edb53be75e2a52aff7481b92817cfdddd9b4519ccf0e54a6ff"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:00ba3c9818e33f1fa974693fb55d24cdc8ebafcb2e4207680669d8f8d7cca79a"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f0a47efb1dbef13af9c9a54a94a0b814902e547b7f21acb29434504d18f36e3a"}, + {file = "regex-2023.10.3-cp310-cp310-win32.whl", hash = "sha256:36362386b813fa6c9146da6149a001b7bd063dabc4d49522a1f7aa65b725c7ec"}, + {file = "regex-2023.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:c65a3b5330b54103e7d21cac3f6bf3900d46f6d50138d73343d9e5b2900b2353"}, + {file = "regex-2023.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:90a79bce019c442604662d17bf69df99090e24cdc6ad95b18b6725c2988a490e"}, + {file = "regex-2023.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c7964c2183c3e6cce3f497e3a9f49d182e969f2dc3aeeadfa18945ff7bdd7051"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ef80829117a8061f974b2fda8ec799717242353bff55f8a29411794d635d964"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5addc9d0209a9afca5fc070f93b726bf7003bd63a427f65ef797a931782e7edc"}, + {file = 
"regex-2023.10.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c148bec483cc4b421562b4bcedb8e28a3b84fcc8f0aa4418e10898f3c2c0eb9b"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d1f21af4c1539051049796a0f50aa342f9a27cde57318f2fc41ed50b0dbc4ac"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b9ac09853b2a3e0d0082104036579809679e7715671cfbf89d83c1cb2a30f58"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ebedc192abbc7fd13c5ee800e83a6df252bec691eb2c4bedc9f8b2e2903f5e2a"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d8a993c0a0ffd5f2d3bda23d0cd75e7086736f8f8268de8a82fbc4bd0ac6791e"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:be6b7b8d42d3090b6c80793524fa66c57ad7ee3fe9722b258aec6d0672543fd0"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4023e2efc35a30e66e938de5aef42b520c20e7eda7bb5fb12c35e5d09a4c43f6"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0d47840dc05e0ba04fe2e26f15126de7c755496d5a8aae4a08bda4dd8d646c54"}, + {file = "regex-2023.10.3-cp311-cp311-win32.whl", hash = "sha256:9145f092b5d1977ec8c0ab46e7b3381b2fd069957b9862a43bd383e5c01d18c2"}, + {file = "regex-2023.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:b6104f9a46bd8743e4f738afef69b153c4b8b592d35ae46db07fc28ae3d5fb7c"}, + {file = "regex-2023.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:bff507ae210371d4b1fe316d03433ac099f184d570a1a611e541923f78f05037"}, + {file = "regex-2023.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:be5e22bbb67924dea15039c3282fa4cc6cdfbe0cbbd1c0515f9223186fc2ec5f"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4a992f702c9be9c72fa46f01ca6e18d131906a7180950958f766c2aa294d4b41"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7434a61b158be563c1362d9071358f8ab91b8d928728cd2882af060481244c9e"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2169b2dcabf4e608416f7f9468737583ce5f0a6e8677c4efbf795ce81109d7c"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9e908ef5889cda4de038892b9accc36d33d72fb3e12c747e2799a0e806ec841"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12bd4bc2c632742c7ce20db48e0d99afdc05e03f0b4c1af90542e05b809a03d9"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bc72c231f5449d86d6c7d9cc7cd819b6eb30134bb770b8cfdc0765e48ef9c420"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bce8814b076f0ce5766dc87d5a056b0e9437b8e0cd351b9a6c4e1134a7dfbda9"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ba7cd6dc4d585ea544c1412019921570ebd8a597fabf475acc4528210d7c4a6f"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b0c7d2f698e83f15228ba41c135501cfe7d5740181d5903e250e47f617eb4292"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5a8f91c64f390ecee09ff793319f30a0f32492e99f5dc1c72bc361f23ccd0a9a"}, + {file = "regex-2023.10.3-cp312-cp312-win32.whl", hash = "sha256:ad08a69728ff3c79866d729b095872afe1e0557251da4abb2c5faff15a91d19a"}, + {file = "regex-2023.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:39cdf8d141d6d44e8d5a12a8569d5a227f645c87df4f92179bd06e2e2705e76b"}, + {file = "regex-2023.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4a3ee019a9befe84fa3e917a2dd378807e423d013377a884c1970a3c2792d293"}, + {file = 
"regex-2023.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76066d7ff61ba6bf3cb5efe2428fc82aac91802844c022d849a1f0f53820502d"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe50b61bab1b1ec260fa7cd91106fa9fece57e6beba05630afe27c71259c59b"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fd88f373cb71e6b59b7fa597e47e518282455c2734fd4306a05ca219a1991b0"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3ab05a182c7937fb374f7e946f04fb23a0c0699c0450e9fb02ef567412d2fa3"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dac37cf08fcf2094159922edc7a2784cfcc5c70f8354469f79ed085f0328ebdf"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e54ddd0bb8fb626aa1f9ba7b36629564544954fff9669b15da3610c22b9a0991"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3367007ad1951fde612bf65b0dffc8fd681a4ab98ac86957d16491400d661302"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:16f8740eb6dbacc7113e3097b0a36065a02e37b47c936b551805d40340fb9971"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:f4f2ca6df64cbdd27f27b34f35adb640b5d2d77264228554e68deda54456eb11"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:39807cbcbe406efca2a233884e169d056c35aa7e9f343d4e78665246a332f597"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7eece6fbd3eae4a92d7c748ae825cbc1ee41a89bb1c3db05b5578ed3cfcfd7cb"}, + {file = "regex-2023.10.3-cp37-cp37m-win32.whl", hash = "sha256:ce615c92d90df8373d9e13acddd154152645c0dc060871abf6bd43809673d20a"}, + {file = 
"regex-2023.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f649fa32fe734c4abdfd4edbb8381c74abf5f34bc0b3271ce687b23729299ed"}, + {file = "regex-2023.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9b98b7681a9437262947f41c7fac567c7e1f6eddd94b0483596d320092004533"}, + {file = "regex-2023.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:91dc1d531f80c862441d7b66c4505cd6ea9d312f01fb2f4654f40c6fdf5cc37a"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82fcc1f1cc3ff1ab8a57ba619b149b907072e750815c5ba63e7aa2e1163384a4"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7979b834ec7a33aafae34a90aad9f914c41fd6eaa8474e66953f3f6f7cbd4368"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef71561f82a89af6cfcbee47f0fabfdb6e63788a9258e913955d89fdd96902ab"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd829712de97753367153ed84f2de752b86cd1f7a88b55a3a775eb52eafe8a94"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00e871d83a45eee2f8688d7e6849609c2ca2a04a6d48fba3dff4deef35d14f07"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:706e7b739fdd17cb89e1fbf712d9dc21311fc2333f6d435eac2d4ee81985098c"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cc3f1c053b73f20c7ad88b0d1d23be7e7b3901229ce89f5000a8399746a6e039"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6f85739e80d13644b981a88f529d79c5bdf646b460ba190bffcaf6d57b2a9863"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:741ba2f511cc9626b7561a440f87d658aabb3d6b744a86a3c025f866b4d19e7f"}, + {file = 
"regex-2023.10.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e77c90ab5997e85901da85131fd36acd0ed2221368199b65f0d11bca44549711"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:979c24cbefaf2420c4e377ecd1f165ea08cc3d1fbb44bdc51bccbbf7c66a2cb4"}, + {file = "regex-2023.10.3-cp38-cp38-win32.whl", hash = "sha256:58837f9d221744d4c92d2cf7201c6acd19623b50c643b56992cbd2b745485d3d"}, + {file = "regex-2023.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:c55853684fe08d4897c37dfc5faeff70607a5f1806c8be148f1695be4a63414b"}, + {file = "regex-2023.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2c54e23836650bdf2c18222c87f6f840d4943944146ca479858404fedeb9f9af"}, + {file = "regex-2023.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69c0771ca5653c7d4b65203cbfc5e66db9375f1078689459fe196fe08b7b4930"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ac965a998e1388e6ff2e9781f499ad1eaa41e962a40d11c7823c9952c77123e"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c0e8fae5b27caa34177bdfa5a960c46ff2f78ee2d45c6db15ae3f64ecadde14"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6c56c3d47da04f921b73ff9415fbaa939f684d47293f071aa9cbb13c94afc17d"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ef1e014eed78ab650bef9a6a9cbe50b052c0aebe553fb2881e0453717573f52"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d29338556a59423d9ff7b6eb0cb89ead2b0875e08fe522f3e068b955c3e7b59b"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9c6d0ced3c06d0f183b73d3c5920727268d2201aa0fe6d55c60d68c792ff3588"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:994645a46c6a740ee8ce8df7911d4aee458d9b1bc5639bc968226763d07f00fa"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:66e2fe786ef28da2b28e222c89502b2af984858091675044d93cb50e6f46d7af"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:11175910f62b2b8c055f2b089e0fedd694fe2be3941b3e2633653bc51064c528"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:06e9abc0e4c9ab4779c74ad99c3fc10d3967d03114449acc2c2762ad4472b8ca"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fb02e4257376ae25c6dd95a5aec377f9b18c09be6ebdefa7ad209b9137b73d48"}, + {file = "regex-2023.10.3-cp39-cp39-win32.whl", hash = "sha256:3b2c3502603fab52d7619b882c25a6850b766ebd1b18de3df23b2f939360e1bd"}, + {file = "regex-2023.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:adbccd17dcaff65704c856bd29951c58a1bd4b2b0f8ad6b826dbd543fe740988"}, + {file = "regex-2023.10.3.tar.gz", hash = "sha256:3fef4f844d2290ee0ba57addcec17eec9e3df73f10a2748485dfd6a3a188cc0f"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -1479,6 +1712,16 @@ files = [ {file = "wcwidth-0.2.10.tar.gz", hash = "sha256:390c7454101092a6a5e43baad8f83de615463af459201709556b6e4b1c861f97"}, ] +[[package]] +name = "wget" +version = "3.2" +description = "pure python download utility" +optional = false +python-versions = "*" +files = [ + {file = "wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061"}, +] + [[package]] name = "yarl" version = "1.9.2" @@ -1584,4 +1827,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "7955e07ea098c2e8b29421733eb5ec6c06cbbc5bf64bd88451baa1a42c71e6b2" +content-hash = "4953e126f4e42186a812ca444ae887f723a5b76943234b7897df3bd8563944a3" diff --git a/pyproject.toml b/pyproject.toml index f45792aa..f2b4ba2e 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -15,6 +15,7 @@ python = "^3.10" pydantic = "^1.8.2" openai = "^0.28.1" cohere = "^4.32" +pinecone-text = "^0.7.0" [tool.poetry.group.dev.dependencies] diff --git a/semantic_router/__init__.py b/semantic_router/__init__.py index e69de29b..ac1e314e 100644 --- a/semantic_router/__init__.py +++ b/semantic_router/__init__.py @@ -0,0 +1,3 @@ +from .layer import DecisionLayer, HybridDecisionLayer + +__all__ = ["DecisionLayer", "HybridDecisionLayer"] \ No newline at end of file diff --git a/semantic_router/encoders/__init__.py b/semantic_router/encoders/__init__.py new file mode 100644 index 00000000..0c86ce7c --- /dev/null +++ b/semantic_router/encoders/__init__.py @@ -0,0 +1,6 @@ +from .base import BaseEncoder +from .cohere import CohereEncoder +from .openai import OpenAIEncoder +from .bm25 import BM25Encoder + +__all__ = ["BaseEncoder", "CohereEncoder", "OpenAIEncoder", "BM25Encoder"] diff --git a/semantic_router/retrievers/base.py b/semantic_router/encoders/base.py similarity index 88% rename from semantic_router/retrievers/base.py rename to semantic_router/encoders/base.py index 4274e074..b6de1f89 100644 --- a/semantic_router/retrievers/base.py +++ b/semantic_router/encoders/base.py @@ -1,7 +1,7 @@ from pydantic import BaseModel -class BaseRetriever(BaseModel): +class BaseEncoder(BaseModel): name: str class Config: diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py new file mode 100644 index 00000000..344f0820 --- /dev/null +++ b/semantic_router/encoders/bm25.py @@ -0,0 +1,40 @@ +from pinecone_text.sparse import BM25Encoder as encoder + +from semantic_router.encoders import BaseEncoder + + +class BM25Encoder(BaseEncoder): + model: encoder | None = None + idx_mapping: dict[int, int] | None = None + + def __init__(self, name: str = "bm25"): + super().__init__(name=name) + # initialize BM25 encoder with default params (trained on MSMarco) + self.model = encoder.default() + self.idx_mapping = { + idx: i for i, idx in 
enumerate(self.model.get_params()["doc_freq"]["indices"]) + } + + def __call__(self, docs: list[str]) -> list[list[float]]: + if len(docs) == 1: + sparse_dicts = self.model.encode_query(docs[0]) + elif len(docs) > 1: + sparse_dicts = self.model.encode_documents(docs) + else: + raise ValueError("No documents to encode.") + # convert sparse dict to sparse vector + embeds = [0.0] * len(self.idx_mapping) + for output in sparse_dicts: + indices = output["indices"] + values = output["values"] + for idx, val in zip(indices, values): + if idx in self.idx_mapping: + print(idx, self.idx_mapping[idx]) + position = self.idx_mapping[idx] + embeds[position] = val + else: + print(idx, "not in encoder.idx_mapping") + return embeds + + def fit(self, docs: list[str]): + self.model.fit(docs) diff --git a/semantic_router/retrievers/cohere.py b/semantic_router/encoders/cohere.py similarity index 88% rename from semantic_router/retrievers/cohere.py rename to semantic_router/encoders/cohere.py index 187cb744..fd20fa75 100644 --- a/semantic_router/retrievers/cohere.py +++ b/semantic_router/encoders/cohere.py @@ -2,10 +2,10 @@ import cohere -from semantic_router.retrievers import BaseRetriever +from semantic_router.encoders import BaseEncoder -class CohereRetriever(BaseRetriever): +class CohereEncoder(BaseEncoder): client: cohere.Client | None def __init__( diff --git a/semantic_router/retrievers/huggingface.py b/semantic_router/encoders/huggingface.py similarity index 61% rename from semantic_router/retrievers/huggingface.py rename to semantic_router/encoders/huggingface.py index 9c8f2f05..52ddecd2 100644 --- a/semantic_router/retrievers/huggingface.py +++ b/semantic_router/encoders/huggingface.py @@ -1,7 +1,7 @@ -from semantic_router.retrievers import BaseRetriever +from semantic_router.encoders import BaseEncoder -class HuggingFaceRetriever(BaseRetriever): +class HuggingFaceEncoder(BaseEncoder): def __init__(self, name: str): self.name = name diff --git 
a/semantic_router/retrievers/openai.py b/semantic_router/encoders/openai.py similarity index 92% rename from semantic_router/retrievers/openai.py rename to semantic_router/encoders/openai.py index 2dbfd880..5700c800 100644 --- a/semantic_router/retrievers/openai.py +++ b/semantic_router/encoders/openai.py @@ -4,10 +4,10 @@ import openai from openai.error import RateLimitError -from semantic_router.retrievers import BaseRetriever +from semantic_router.encoders import BaseEncoder -class OpenAIRetriever(BaseRetriever): +class OpenAIEncoder(BaseEncoder): def __init__(self, name: str, openai_api_key: str | None = None): super().__init__(name=name) openai.api_key = openai_api_key or os.getenv("OPENAI_API_KEY") diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py new file mode 100644 index 00000000..5dc7f34d --- /dev/null +++ b/semantic_router/encoders/tfidf.py @@ -0,0 +1,37 @@ +from functools import partial + +from sklearn.feature_extraction.text import TfidfVectorizer + +from semantic_router.encoders import BaseEncoder + + +class TfidfEncoder(BaseEncoder): + model: encoder | None = None + + def __init__(self, name: str = "bm25"): + super().__init__(name=name) + # initialize BM25 encoder with default params (trained on MSMarco) + self.model = encoder.default() + self.idx_mapping = { + idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) + } + + def __call__(self, docs: list[str]) -> list[list[float]]: + if len(docs) == 1: + sparse_dicts = self.model.encode_query(docs[0]) + elif len(docs) > 1: + sparse_dicts = self.model.encode_documents(docs) + else: + raise ValueError("No documents to encode.") + # convert sparse dict to sparse vector + embeds = [0.0] * len(self.idx_mapping) + for output in sparse_dicts: + indices = output["indices"] + values = output["values"] + for idx, val in zip(indices, values): + position = self.idx_mapping[idx] + embeds[position] = val + return embeds + + def fit(self, docs: list[str]): + 
self.model.fit(docs) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index ad27a4c1..e8b71576 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -1,11 +1,11 @@ import numpy as np from numpy.linalg import norm -from semantic_router.retrievers import ( - BaseRetriever, - CohereRetriever, - OpenAIRetriever, - BM25Retriever +from semantic_router.encoders import ( + BaseEncoder, + CohereEncoder, + OpenAIEncoder, + BM25Encoder ) from semantic_router.schema import Decision @@ -15,12 +15,12 @@ class DecisionLayer: categories = None score_threshold = 0.82 - def __init__(self, encoder: BaseRetriever, decisions: list[Decision] = []): + def __init__(self, encoder: BaseEncoder, decisions: list[Decision] = []): self.encoder = encoder # decide on default threshold based on encoder - if isinstance(encoder, OpenAIRetriever): + if isinstance(encoder, OpenAIEncoder): self.score_threshold = 0.82 - elif isinstance(encoder, CohereRetriever): + elif isinstance(encoder, CohereEncoder): self.score_threshold = 0.3 else: self.score_threshold = 0.82 @@ -116,17 +116,17 @@ class HybridDecisionLayer: def __init__( self, - encoder: BaseRetriever, + encoder: BaseEncoder, decisions: list[Decision] = [], alpha: float = 0.3 ): self.encoder = encoder - self.sparse_encoder = BM25Retriever() + self.sparse_encoder = BM25Encoder() self.alpha = alpha # decide on default threshold based on encoder - if isinstance(encoder, OpenAIRetriever): + if isinstance(encoder, OpenAIEncoder): self.score_threshold = 0.82 - elif isinstance(encoder, CohereRetriever): + elif isinstance(encoder, CohereEncoder): self.score_threshold = 0.3 else: self.score_threshold = 0.82 @@ -150,8 +150,8 @@ def add(self, decision: Decision): def _add_decision(self, decision: Decision): # create embeddings - dense_embeds = self.encoder(decision.utterances) - sparse_embeds = self.sparse_encoder(decision.utterances) + dense_embeds = self.encoder(decision.utterances) * self.alpha + sparse_embeds = 
self.sparse_encoder(decision.utterances) * (1 - self.alpha) # concatenate vectors to create hybrid vecs embeds = np.concatenate([ dense_embeds, sparse_embeds @@ -168,12 +168,20 @@ def _add_decision(self, decision: Decision): self.utterances, np.array(decision.utterances) ]) - # create utterance array (the index) + # create utterance array (the dense index) if self.index is None: - self.index = np.array(embeds) + self.index = np.array(dense_embeds) else: - embed_arr = np.array(embeds) + embed_arr = np.array(dense_embeds) self.index = np.concatenate([self.index, embed_arr]) + # create sparse utterance array + if self.sparse_index is None: + self.sparse_index = np.array(sparse_embeds) + else: + sparse_embed_arr = np.array(sparse_embeds) + self.sparse_index = np.concatenate([ + self.sparse_index, sparse_embed_arr + ]) def _query(self, text: str, top_k: int = 5): """Given some text, encodes and searches the index vector space to diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py index b703a960..e79608b8 100644 --- a/semantic_router/rankers/cohere.py +++ b/semantic_router/rankers/cohere.py @@ -2,10 +2,10 @@ import cohere -from semantic_router.rankers import BaseReranker +from semantic_router.rankers import BaseRanker -class CohereRanker(BaseReranker): +class CohereRanker(BaseRanker): client: cohere.Client | None def __init__( diff --git a/semantic_router/retrievers/__init__.py b/semantic_router/retrievers/__init__.py deleted file mode 100644 index 0fcaa6d2..00000000 --- a/semantic_router/retrievers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import BaseRetriever -from .cohere import CohereRetriever -from .openai import OpenAIRetriever - -__all__ = ["BaseRetriever", "CohereRetriever", "OpenAIRetriever"] diff --git a/semantic_router/retrievers/bm25.py b/semantic_router/retrievers/bm25.py deleted file mode 100644 index 2a68a3ff..00000000 --- a/semantic_router/retrievers/bm25.py +++ /dev/null @@ -1,21 +0,0 @@ -import os - -from 
pinecone_text import BM25Encoder - -from semantic_router.retrievers import BaseRetriever - - -class BM25Retriever(BaseRetriever): - def __init__(self, name: str = "bm25"): - super().__init__(name=name) - self.model = BM25Encoder() - - def __call__(self, docs: list[str]) -> list[list[float]]: - if self.params is None: - raise ValueError("BM25 model not trained, must call `.fit` first.") - embeds = self.model.encode_doocuments(docs) - return embeds.embeddings - - def fit(self, docs: list[str]): - params = self.model.fit(docs) - self.model.set_params(**params) \ No newline at end of file diff --git a/semantic_router/schema.py b/semantic_router/schema.py index cb1288fb..37a43dd4 100644 --- a/semantic_router/schema.py +++ b/semantic_router/schema.py @@ -3,10 +3,10 @@ from pydantic import BaseModel from pydantic.dataclasses import dataclass -from semantic_router.retrievers import ( - BaseRetriever, - CohereRetriever, - OpenAIRetriever, +from semantic_router.encoders import ( + BaseEncoder, + CohereEncoder, + OpenAIEncoder, ) @@ -16,27 +16,27 @@ class Decision(BaseModel): description: str | None = None -class RetrieverType(Enum): +class EncoderType(Enum): HUGGINGFACE = "huggingface" OPENAI = "openai" COHERE = "cohere" @dataclass -class Retriever: - type: RetrieverType +class Encoder: + type: EncoderType name: str - model: BaseRetriever + model: BaseEncoder def __init__(self, type: str, name: str): - self.type = RetrieverType(type) + self.type = EncoderType(type) self.name = name - if self.type == RetrieverType.HUGGINGFACE: + if self.type == EncoderType.HUGGINGFACE: raise NotImplementedError - elif self.type == RetrieverType.OPENAI: - self.model = OpenAIRetriever(name) - elif self.type == RetrieverType.COHERE: - self.model = CohereRetriever(name) + elif self.type == EncoderType.OPENAI: + self.model = OpenAIEncoder(name) + elif self.type == EncoderType.COHERE: + self.model = CohereEncoder(name) def __call__(self, texts: list[str]) -> list[float]: return self.model(texts) 
From 9b34b58b91e6815778f4f6c80eb479fc348e1561 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Tue, 21 Nov 2023 20:14:59 -0800 Subject: [PATCH 04/16] further debugging --- docs/examples/hybrid-layer.ipynb | 77 ++++++++++++++------------------ semantic_router/encoders/bm25.py | 3 +- semantic_router/layer.py | 49 ++++++++++---------- 3 files changed, 61 insertions(+), 68 deletions(-) diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb index 589e13a1..d3fb58c5 100644 --- a/docs/examples/hybrid-layer.ipynb +++ b/docs/examples/hybrid-layer.ipynb @@ -151,39 +151,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2067848296 1405\n", - "2212344012 2520\n", - "3313717465 206\n", - "3076736765 769\n", - "1778150425 4131\n", - "2067848296 1405\n", - "202708381 770\n", - "2212344012 2520\n", - "3374841595 2375\n", - "2067848296 1405\n", - "3508911095 2067\n", - "3454774732 not in encoder.idx_mapping\n", - "2379717389 3565\n", - "298452803 4356\n", - "1063320047 3369\n", - "4186256544 713\n", - "1846246980 858\n", - "3897916792 643\n", - "575623047 1476\n", - "3897916792 643\n" - ] - }, - { - "ename": "ValueError", - "evalue": "all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 14\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msemantic_router\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlayer\u001b[39;00m \u001b[39mimport\u001b[39;00m HybridDecisionLayer\n\u001b[0;32m----> 3\u001b[0m dl \u001b[39m=\u001b[39m 
HybridDecisionLayer(encoder\u001b[39m=\u001b[39;49mencoder, decisions\u001b[39m=\u001b[39;49mdecisions)\n", - "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:137\u001b[0m, in \u001b[0;36mHybridDecisionLayer.__init__\u001b[0;34m(self, encoder, decisions, alpha)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[39mif\u001b[39;00m decisions:\n\u001b[1;32m 135\u001b[0m \u001b[39m# initialize index now\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[39mfor\u001b[39;00m decision \u001b[39min\u001b[39;00m decisions:\n\u001b[0;32m--> 137\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_add_decision(decision\u001b[39m=\u001b[39;49mdecision)\n", - "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:156\u001b[0m, in \u001b[0;36mHybridDecisionLayer._add_decision\u001b[0;34m(self, decision)\u001b[0m\n\u001b[1;32m 154\u001b[0m sparse_embeds \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_encoder(decision\u001b[39m.\u001b[39mutterances)\n\u001b[1;32m 155\u001b[0m \u001b[39m# concatenate vectors to create hybrid vecs\u001b[39;00m\n\u001b[0;32m--> 156\u001b[0m embeds \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mconcatenate([\n\u001b[1;32m 157\u001b[0m dense_embeds, sparse_embeds\n\u001b[1;32m 158\u001b[0m ], axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m 160\u001b[0m \u001b[39m# create decision array\u001b[39;00m\n\u001b[1;32m 161\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcategories \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", - "\u001b[0;31mValueError\u001b[0m: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)" + "3454774732 not in encoder.idx_mapping\n" ] } ], @@ -195,9 +163,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "ename": "AxisError", + "evalue": "axis 1 is out of bounds for array of dimension 1", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAxisError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 15\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m dl(\u001b[39m\"\u001b[39;49m\u001b[39mdon\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mt you love politics?\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", + "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:141\u001b[0m, in \u001b[0;36mHybridDecisionLayer.__call__\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 141\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_query(text)\n\u001b[1;32m 142\u001b[0m top_class, top_class_scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_semantic_classify(results)\n\u001b[1;32m 143\u001b[0m passed \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pass_threshold(top_class_scores, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscore_threshold)\n", + "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:204\u001b[0m, in \u001b[0;36mHybridDecisionLayer._query\u001b[0;34m(self, text, top_k)\u001b[0m\n\u001b[1;32m 202\u001b[0m sim_d \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mindex, xq_d\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (index_norm \u001b[39m*\u001b[39m 
xq_d_norm)\n\u001b[1;32m 203\u001b[0m \u001b[39m# calculate sparse vec similarity\u001b[39;00m\n\u001b[0;32m--> 204\u001b[0m sparse_norm \u001b[39m=\u001b[39m norm(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msparse_index, axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m 205\u001b[0m xq_s_norm \u001b[39m=\u001b[39m norm(xq_s\u001b[39m.\u001b[39mT)\n\u001b[1;32m 206\u001b[0m sim_s \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_index, xq_s\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (sparse_norm \u001b[39m*\u001b[39m xq_s_norm)\n", + "File \u001b[0;32m~/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/numpy/linalg/linalg.py:2583\u001b[0m, in \u001b[0;36mnorm\u001b[0;34m(x, ord, axis, keepdims)\u001b[0m\n\u001b[1;32m 2580\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[1;32m 2581\u001b[0m \u001b[39m# special case for speedup\u001b[39;00m\n\u001b[1;32m 2582\u001b[0m s \u001b[39m=\u001b[39m (x\u001b[39m.\u001b[39mconj() \u001b[39m*\u001b[39m x)\u001b[39m.\u001b[39mreal\n\u001b[0;32m-> 2583\u001b[0m \u001b[39mreturn\u001b[39;00m sqrt(add\u001b[39m.\u001b[39;49mreduce(s, axis\u001b[39m=\u001b[39;49maxis, keepdims\u001b[39m=\u001b[39;49mkeepdims))\n\u001b[1;32m 2584\u001b[0m \u001b[39m# None of the str-type keywords for ord ('fro', 'nuc')\u001b[39;00m\n\u001b[1;32m 2585\u001b[0m \u001b[39m# are valid for vectors\u001b[39;00m\n\u001b[1;32m 2586\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mord\u001b[39m, \u001b[39mstr\u001b[39m):\n", + "\u001b[0;31mAxisError\u001b[0m: axis 1 is out of bounds for array of dimension 1" + ] + } + ], "source": [ "dl(\"don't you love politics?\")" ] @@ -207,10 +190,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "if 
3454774732 in encoder.idx_mapping:\n", - " print(\"yes\")" - ] + "source": [] }, { "cell_type": "code", @@ -220,11 +200,13 @@ "source": [] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "---\n", + "\n", + "#### Testing" + ] }, { "cell_type": "code", @@ -347,6 +329,15 @@ "sparse_vec.shape" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Finish Testing\n", + "\n", + "---" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py index 344f0820..c419ac8f 100644 --- a/semantic_router/encoders/bm25.py +++ b/semantic_router/encoders/bm25.py @@ -17,7 +17,7 @@ def __init__(self, name: str = "bm25"): def __call__(self, docs: list[str]) -> list[list[float]]: if len(docs) == 1: - sparse_dicts = self.model.encode_query(docs[0]) + sparse_dicts = self.model.encode_queries(docs) elif len(docs) > 1: sparse_dicts = self.model.encode_documents(docs) else: @@ -29,7 +29,6 @@ def __call__(self, docs: list[str]) -> list[list[float]]: values = output["values"] for idx, val in zip(indices, values): if idx in self.idx_mapping: - print(idx, self.idx_mapping[idx]) position = self.idx_mapping[idx] embeds[position] = val else: diff --git a/semantic_router/layer.py b/semantic_router/layer.py index e8b71576..070c9c0b 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -111,6 +111,7 @@ def _pass_threshold(self, scores: list[float], threshold: float) -> bool: class HybridDecisionLayer: index = None + sparse_index = None categories = None score_threshold = 0.82 @@ -150,19 +151,19 @@ def add(self, decision: Decision): def _add_decision(self, decision: Decision): # create embeddings - dense_embeds = self.encoder(decision.utterances) * self.alpha - sparse_embeds = self.sparse_encoder(decision.utterances) * (1 - self.alpha) - # concatenate vectors to create hybrid vecs - embeds = 
np.concatenate([ - dense_embeds, sparse_embeds - ], axis=1) + dense_embeds = np.array( + self.encoder(decision.utterances) + ) # * self.alpha + sparse_embeds = np.array( + self.sparse_encoder(decision.utterances) + ) # * (1 - self.alpha) # create decision array if self.categories is None: - self.categories = np.array([decision.name] * len(embeds)) + self.categories = np.array([decision.name] * len(decision.utterances)) self.utterances = np.array(decision.utterances) else: - str_arr = np.array([decision.name] * len(embeds)) + str_arr = np.array([decision.name] * len(decision.utterances)) self.categories = np.concatenate([self.categories, str_arr]) self.utterances = np.concatenate([ self.utterances, @@ -170,17 +171,15 @@ def _add_decision(self, decision: Decision): ]) # create utterance array (the dense index) if self.index is None: - self.index = np.array(dense_embeds) + self.index = dense_embeds else: - embed_arr = np.array(dense_embeds) - self.index = np.concatenate([self.index, embed_arr]) + self.index = np.concatenate([self.index, dense_embeds]) # create sparse utterance array if self.sparse_index is None: - self.sparse_index = np.array(sparse_embeds) + self.sparse_index = sparse_embeds else: - sparse_embed_arr = np.array(sparse_embeds) self.sparse_index = np.concatenate([ - self.sparse_index, sparse_embed_arr + self.sparse_index, sparse_embeds ]) def _query(self, text: str, top_k: int = 5): @@ -195,17 +194,21 @@ def _query(self, text: str, top_k: int = 5): xq_s = np.squeeze(xq_s) # convex scaling xq_d, xq_s = self._convex_scaling(xq_d, xq_s) - # concatenate to create single hybrid vec - xq = np.concatenate([xq_d, xq_s], axis=1) if self.index is not None: + # calculate dense vec similarity index_norm = norm(self.index, axis=1) - xq_norm = norm(xq.T) - sim = np.dot(self.index, xq.T) / (index_norm * xq_norm) + xq_d_norm = norm(xq_d.T) + sim_d = np.dot(self.index, xq_d.T) / (index_norm * xq_d_norm) + # calculate sparse vec similarity + sparse_norm = 
norm(self.sparse_index, axis=1) + xq_s_norm = norm(xq_s.T) + sim_s = np.dot(self.sparse_index, xq_s.T) / (sparse_norm * xq_s_norm) + total_sim = (sim_d + sim_s) # get indices of top_k records - top_k = min(top_k, sim.shape[0]) - idx = np.argpartition(sim, -top_k)[-top_k:] - scores = sim[idx] + top_k = min(top_k, total_sim.shape[0]) + idx = np.argpartition(total_sim, -top_k)[-top_k:] + scores = total_sim[idx] # get the utterance categories (decision names) decisions = self.categories[idx] if self.categories is not None else [] return [ @@ -216,8 +219,8 @@ def _query(self, text: str, top_k: int = 5): def _convex_scaling(self, dense: list[float], sparse: list[float]): # scale sparse and dense vecs - dense = dense * self.alpha - sparse = sparse * (1 - self.alpha) + dense = np.array(dense) * self.alpha + sparse = np.array(sparse) * (1 - self.alpha) return dense, sparse def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float]]: From b3fe444ae9e1b52e213040a95b42a3713eb43d55 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Fri, 1 Dec 2023 23:34:23 -0800 Subject: [PATCH 05/16] fixed bm25 encoder --- semantic_router/encoders/bm25.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py index c419ac8f..0f3985be 100644 --- a/semantic_router/encoders/bm25.py +++ b/semantic_router/encoders/bm25.py @@ -23,14 +23,14 @@ def __call__(self, docs: list[str]) -> list[list[float]]: else: raise ValueError("No documents to encode.") # convert sparse dict to sparse vector - embeds = [0.0] * len(self.idx_mapping) - for output in sparse_dicts: + embeds = [[0.0] * len(self.idx_mapping)] * len(docs) + for i, output in enumerate(sparse_dicts): indices = output["indices"] values = output["values"] for idx, val in zip(indices, values): if idx in self.idx_mapping: position = self.idx_mapping[idx] - embeds[position] = val + 
embeds[i][position] = val else: print(idx, "not in encoder.idx_mapping") return embeds From a57ed3f53942efc58d51ea23eb39557782cb6652 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 11:39:44 -0800 Subject: [PATCH 06/16] hybrid fixes --- semantic_router/layer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 070c9c0b..3bb685cc 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -1,5 +1,6 @@ import numpy as np from numpy.linalg import norm +from tqdm.auto import tqdm from semantic_router.encoders import ( BaseEncoder, @@ -27,7 +28,7 @@ def __init__(self, encoder: BaseEncoder, decisions: list[Decision] = []): # if decisions list has been passed, we initialize index now if decisions: # initialize index now - for decision in decisions: + for decision in tqdm(decisions): self._add_decision(decision=decision) def __call__(self, text: str) -> str | None: @@ -134,7 +135,7 @@ def __init__( # if decisions list has been passed, we initialize index now if decisions: # initialize index now - for decision in decisions: + for decision in tqdm(decisions): self._add_decision(decision=decision) def __call__(self, text: str) -> str | None: From 35b0c8efa3f087ef00d06e151701283306b48773 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 11:40:23 -0800 Subject: [PATCH 07/16] example notebook for hybrid --- docs/examples/hybrid-layer.ipynb | 248 +++++-------------------------- 1 file changed, 37 insertions(+), 211 deletions(-) diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb index d3fb58c5..fa6b78c7 100644 --- a/docs/examples/hybrid-layer.ipynb +++ b/docs/examples/hybrid-layer.ipynb @@ -52,7 +52,7 @@ "source": [ "import os\n", "\n", - "os.environ[\"COHERE_API_KEY\"] = \"<>\"" + "os.environ[\"COHERE_API_KEY\"] = \"<>\"" ] }, { @@ 
-78,7 +78,8 @@ " utterances=[\n", " \"isn't politics the best thing ever\",\n", " \"why don't you tell me about your political opinions\",\n", - " \"don't you just love the president\" \"don't you just hate the president\",\n", + " \"don't you just love the president\",\n", + " \"don't you just hate the president\",\n", " \"they're going to destroy this country!\",\n", " \"they will save the country!\",\n", " ],\n", @@ -98,6 +99,17 @@ "metadata": {}, "outputs": [], "source": [ + "chitchat = Decision(\n", + " name=\"chitchat\",\n", + " utterances=[\n", + " \"how's the weather today?\",\n", + " \"how are things going?\",\n", + " \"lovely weather today\",\n", + " \"the weather is horrendous\",\n", + " \"let's go to the chippy\",\n", + " ],\n", + ")\n", + "\n", "chitchat = Decision(\n", " name=\"chitchat\",\n", " utterances=[\n", @@ -146,15 +158,7 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3454774732 not in encoder.idx_mapping\n" - ] - } - ], + "outputs": [], "source": [ "from semantic_router.layer import HybridDecisionLayer\n", "\n", @@ -167,18 +171,14 @@ "metadata": {}, "outputs": [ { - "ename": "AxisError", - "evalue": "axis 1 is out of bounds for array of dimension 1", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAxisError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 15\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m dl(\u001b[39m\"\u001b[39;49m\u001b[39mdon\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mt you love politics?\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", - "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:141\u001b[0m, in 
\u001b[0;36mHybridDecisionLayer.__call__\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 141\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_query(text)\n\u001b[1;32m 142\u001b[0m top_class, top_class_scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_semantic_classify(results)\n\u001b[1;32m 143\u001b[0m passed \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pass_threshold(top_class_scores, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscore_threshold)\n", - "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:204\u001b[0m, in \u001b[0;36mHybridDecisionLayer._query\u001b[0;34m(self, text, top_k)\u001b[0m\n\u001b[1;32m 202\u001b[0m sim_d \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mindex, xq_d\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (index_norm \u001b[39m*\u001b[39m xq_d_norm)\n\u001b[1;32m 203\u001b[0m \u001b[39m# calculate sparse vec similarity\u001b[39;00m\n\u001b[0;32m--> 204\u001b[0m sparse_norm \u001b[39m=\u001b[39m norm(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msparse_index, axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m 205\u001b[0m xq_s_norm \u001b[39m=\u001b[39m norm(xq_s\u001b[39m.\u001b[39mT)\n\u001b[1;32m 206\u001b[0m sim_s \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_index, xq_s\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (sparse_norm \u001b[39m*\u001b[39m xq_s_norm)\n", - "File \u001b[0;32m~/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/numpy/linalg/linalg.py:2583\u001b[0m, in \u001b[0;36mnorm\u001b[0;34m(x, ord, 
axis, keepdims)\u001b[0m\n\u001b[1;32m 2580\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[1;32m 2581\u001b[0m \u001b[39m# special case for speedup\u001b[39;00m\n\u001b[1;32m 2582\u001b[0m s \u001b[39m=\u001b[39m (x\u001b[39m.\u001b[39mconj() \u001b[39m*\u001b[39m x)\u001b[39m.\u001b[39mreal\n\u001b[0;32m-> 2583\u001b[0m \u001b[39mreturn\u001b[39;00m sqrt(add\u001b[39m.\u001b[39;49mreduce(s, axis\u001b[39m=\u001b[39;49maxis, keepdims\u001b[39m=\u001b[39;49mkeepdims))\n\u001b[1;32m 2584\u001b[0m \u001b[39m# None of the str-type keywords for ord ('fro', 'nuc')\u001b[39;00m\n\u001b[1;32m 2585\u001b[0m \u001b[39m# are valid for vectors\u001b[39;00m\n\u001b[1;32m 2586\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mord\u001b[39m, \u001b[39mstr\u001b[39m):\n", - "\u001b[0;31mAxisError\u001b[0m: axis 1 is out of bounds for array of dimension 1" - ] + "data": { + "text/plain": [ + "'politics'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -187,178 +187,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "#### Testing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from semantic_router.encoders import BM25Encoder\n", - "\n", - "encoder = BM25Encoder()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tests = [\"hello this is some text\", \"and more stuff\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "idx_list = encoder.model.get_params()['doc_freq']['indices']\n", - "idx_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sparse_dicts = encoder.model.encode_documents(tests)\n", - "sparse_dicts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embeds = [0.0] * len(encoder.idx_mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [ - "for output in sparse_dicts:\n", - " indices = output[\"indices\"]\n", - " values = output[\"values\"]\n", - " for idx, val in zip(indices, values):\n", - " position = encoder.idx_mapping[idx]\n", - " embeds[position] = val" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "encoder.idx_mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "encoded_output = encoder(tests)\n", - "encoded_output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "\n", - "sparse_vec = np.zeros(len(idx_list))\n", - "idx_position_dict = {idx: i for i, idx in enumerate(idx_list)}\n", - "\n", - "for output in encoded_output:\n", - " indices = output['indices']\n", - " values = output['values']\n", - " for idx, value in zip(indices, values):\n", - " if idx in idx_position_dict:\n", - " position = idx_position_dict[idx]\n", - " sparse_vec[position] = value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sparse_vec" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sparse_vec.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "#### Finish Testing\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can test it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dl(\"don't you love politics?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'chitchat'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dl(\"how's the weather today?\")" ] @@ -367,23 +209,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Both are classified accurately, what if we send a query that is unrelated to our existing `Decision` objects?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dl(\"I'm interested in learning about llama 2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we return `None` because no matches were identified." 
+ "---" ] } ], From e80eaa40b28c11b71bf79cc164e0ccaf3edb74c7 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 12:42:12 -0800 Subject: [PATCH 08/16] lint and del tfidf --- semantic_router/__init__.py | 2 +- semantic_router/encoders/bm25.py | 3 ++- semantic_router/encoders/tfidf.py | 37 ------------------------------- semantic_router/layer.py | 28 +++++++++-------------- semantic_router/rankers/cohere.py | 10 ++++----- 5 files changed, 18 insertions(+), 62 deletions(-) delete mode 100644 semantic_router/encoders/tfidf.py diff --git a/semantic_router/__init__.py b/semantic_router/__init__.py index ac1e314e..734906f8 100644 --- a/semantic_router/__init__.py +++ b/semantic_router/__init__.py @@ -1,3 +1,3 @@ from .layer import DecisionLayer, HybridDecisionLayer -__all__ = ["DecisionLayer", "HybridDecisionLayer"] \ No newline at end of file +__all__ = ["DecisionLayer", "HybridDecisionLayer"] diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py index 0f3985be..0d498197 100644 --- a/semantic_router/encoders/bm25.py +++ b/semantic_router/encoders/bm25.py @@ -12,7 +12,8 @@ def __init__(self, name: str = "bm25"): # initialize BM25 encoder with default params (trained on MSMarco) self.model = encoder.default() self.idx_mapping = { - idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) + idx: i + for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) } def __call__(self, docs: list[str]) -> list[list[float]]: diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py deleted file mode 100644 index 5dc7f34d..00000000 --- a/semantic_router/encoders/tfidf.py +++ /dev/null @@ -1,37 +0,0 @@ -from functools import partial - -from sklearn.feature_extraction.text import TfidfVectorizer - -from semantic_router.encoders import BaseEncoder - - -class TfidfEncoder(BaseEncoder): - model: encoder | None = None - - def __init__(self, 
name: str = "bm25"): - super().__init__(name=name) - # initialize BM25 encoder with default params (trained on MSMarco) - self.model = encoder.default() - self.idx_mapping = { - idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) - } - - def __call__(self, docs: list[str]) -> list[list[float]]: - if len(docs) == 1: - sparse_dicts = self.model.encode_query(docs[0]) - elif len(docs) > 1: - sparse_dicts = self.model.encode_documents(docs) - else: - raise ValueError("No documents to encode.") - # convert sparse dict to sparse vector - embeds = [0.0] * len(self.idx_mapping) - for output in sparse_dicts: - indices = output["indices"] - values = output["values"] - for idx, val in zip(indices, values): - position = self.idx_mapping[idx] - embeds[position] = val - return embeds - - def fit(self, docs: list[str]): - self.model.fit(docs) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 0c00e916..adff961c 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -6,7 +6,7 @@ BaseEncoder, CohereEncoder, OpenAIEncoder, - BM25Encoder + BM25Encoder, ) from semantic_router.linear import similarity_matrix, top_scores from semantic_router.schema import Decision @@ -114,10 +114,7 @@ class HybridDecisionLayer: score_threshold = 0.82 def __init__( - self, - encoder: BaseEncoder, - decisions: list[Decision] = [], - alpha: float = 0.3 + self, encoder: BaseEncoder, decisions: list[Decision] = [], alpha: float = 0.3 ): self.encoder = encoder self.sparse_encoder = BM25Encoder() @@ -149,9 +146,7 @@ def add(self, decision: Decision): def _add_decision(self, decision: Decision): # create embeddings - dense_embeds = np.array( - self.encoder(decision.utterances) - ) # * self.alpha + dense_embeds = np.array(self.encoder(decision.utterances)) # * self.alpha sparse_embeds = np.array( self.sparse_encoder(decision.utterances) ) # * (1 - self.alpha) @@ -163,10 +158,9 @@ def _add_decision(self, decision: Decision): else: str_arr = 
np.array([decision.name] * len(decision.utterances)) self.categories = np.concatenate([self.categories, str_arr]) - self.utterances = np.concatenate([ - self.utterances, - np.array(decision.utterances) - ]) + self.utterances = np.concatenate( + [self.utterances, np.array(decision.utterances)] + ) # create utterance array (the dense index) if self.index is None: self.index = dense_embeds @@ -176,9 +170,7 @@ def _add_decision(self, decision: Decision): if self.sparse_index is None: self.sparse_index = sparse_embeds else: - self.sparse_index = np.concatenate([ - self.sparse_index, sparse_embeds - ]) + self.sparse_index = np.concatenate([self.sparse_index, sparse_embeds]) def _query(self, text: str, top_k: int = 5): """Given some text, encodes and searches the index vector space to @@ -202,7 +194,7 @@ def _query(self, text: str, top_k: int = 5): sparse_norm = norm(self.sparse_index, axis=1) xq_s_norm = norm(xq_s.T) sim_s = np.dot(self.sparse_index, xq_s.T) / (sparse_norm * xq_s_norm) - total_sim = (sim_d + sim_s) + total_sim = sim_d + sim_s # get indices of top_k records top_k = min(top_k, total_sim.shape[0]) idx = np.argpartition(total_sim, -top_k)[-top_k:] @@ -214,7 +206,7 @@ def _query(self, text: str, top_k: int = 5): ] else: return [] - + def _convex_scaling(self, dense: list[float], sparse: list[float]): # scale sparse and dense vecs dense = np.array(dense) * self.alpha @@ -244,4 +236,4 @@ def _pass_threshold(self, scores: list[float], threshold: float) -> bool: if scores: return max(scores) > threshold else: - return False \ No newline at end of file + return False diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py index e79608b8..7e6e8ad6 100644 --- a/semantic_router/rankers/cohere.py +++ b/semantic_router/rankers/cohere.py @@ -9,9 +9,10 @@ class CohereRanker(BaseRanker): client: cohere.Client | None def __init__( - self, name: str = "rerank-english-v2.0", + self, + name: str = "rerank-english-v2.0", top_n: int = 5, - 
cohere_api_key: str | None = None + cohere_api_key: str | None = None, ): super().__init__(name=name, top_n=top_n) cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY") @@ -22,10 +23,9 @@ def __init__( def __call__(self, query: str, docs: list[str]) -> list[str]: # get top_n results results = self.client.rerank( - query=query, documents=docs, top_n=self.top_n, - model=self.name + query=query, documents=docs, top_n=self.top_n, model=self.name ) # get indices of entries that are ranked highest by cohere top_idx = [r.index for r in results] top_docs = [docs[i] for i in top_idx] - return top_idx, top_docs \ No newline at end of file + return top_idx, top_docs From c70d08616296b81ab3d8d4fad33bb746560bc30c Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 12:48:19 -0800 Subject: [PATCH 09/16] update tests to use score_threshold --- tests/unit/test_layer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py index 96e06a08..0b9842fb 100644 --- a/tests/unit/test_layer.py +++ b/tests/unit/test_layer.py @@ -45,16 +45,16 @@ def decisions(): class TestDecisionLayer: def test_initialization(self, openai_encoder, decisions): decision_layer = DecisionLayer(encoder=openai_encoder, decisions=decisions) - assert decision_layer.similarity_threshold == 0.82 + assert decision_layer.score_threshold == 0.82 assert len(decision_layer.index) == 5 assert len(set(decision_layer.categories)) == 2 def test_initialization_different_encoders(self, cohere_encoder, openai_encoder): decision_layer_cohere = DecisionLayer(encoder=cohere_encoder) - assert decision_layer_cohere.similarity_threshold == 0.3 + assert decision_layer_cohere.score_threshold == 0.3 decision_layer_openai = DecisionLayer(encoder=openai_encoder) - assert decision_layer_openai.similarity_threshold == 0.82 + assert decision_layer_openai.score_threshold == 0.82 def 
test_add_decision(self, openai_encoder): decision_layer = DecisionLayer(encoder=openai_encoder) @@ -107,9 +107,9 @@ def test_pass_threshold(self, openai_encoder): assert not decision_layer._pass_threshold([], 0.5) assert decision_layer._pass_threshold([0.6, 0.7], 0.5) - def test_failover_similarity_threshold(self, base_encoder): + def test_failover_score_threshold(self, base_encoder): decision_layer = DecisionLayer(encoder=base_encoder) - assert decision_layer.similarity_threshold == 0.82 + assert decision_layer.score_threshold == 0.82 # Add more tests for edge cases and error handling as needed. From 42dea8dd0b1fe5f331abe602b043f2bd97f0f543 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:03:55 -0800 Subject: [PATCH 10/16] update tests and removed WIP modules/objects --- semantic_router/encoders/huggingface.py | 9 ---- semantic_router/rankers/__init__.py | 0 semantic_router/rankers/base.py | 12 ----- semantic_router/rankers/cohere.py | 31 ----------- tests/unit/encoders/test_bm25.py | 21 ++++++++ tests/unit/test_layer.py | 70 ++++++++++++++++++++++++- 6 files changed, 90 insertions(+), 53 deletions(-) delete mode 100644 semantic_router/encoders/huggingface.py delete mode 100644 semantic_router/rankers/__init__.py delete mode 100644 semantic_router/rankers/base.py delete mode 100644 semantic_router/rankers/cohere.py create mode 100644 tests/unit/encoders/test_bm25.py diff --git a/semantic_router/encoders/huggingface.py b/semantic_router/encoders/huggingface.py deleted file mode 100644 index 52ddecd2..00000000 --- a/semantic_router/encoders/huggingface.py +++ /dev/null @@ -1,9 +0,0 @@ -from semantic_router.encoders import BaseEncoder - - -class HuggingFaceEncoder(BaseEncoder): - def __init__(self, name: str): - self.name = name - - def __call__(self, docs: list[str]) -> list[float]: - raise NotImplementedError diff --git a/semantic_router/rankers/__init__.py b/semantic_router/rankers/__init__.py 
deleted file mode 100644 index e69de29b..00000000 diff --git a/semantic_router/rankers/base.py b/semantic_router/rankers/base.py deleted file mode 100644 index 5d326f33..00000000 --- a/semantic_router/rankers/base.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import BaseModel - - -class BaseRanker(BaseModel): - name: str - top_n: int = 5 - - class Config: - arbitrary_types_allowed = True - - def __call__(self, query: str, docs: list[str]) -> list[str]: - raise NotImplementedError("Subclasses must implement this method") diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py deleted file mode 100644 index 7e6e8ad6..00000000 --- a/semantic_router/rankers/cohere.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -import cohere - -from semantic_router.rankers import BaseRanker - - -class CohereRanker(BaseRanker): - client: cohere.Client | None - - def __init__( - self, - name: str = "rerank-english-v2.0", - top_n: int = 5, - cohere_api_key: str | None = None, - ): - super().__init__(name=name, top_n=top_n) - cohere_api_key = cohere_api_key or os.getenv("COHERE_API_KEY") - if cohere_api_key is None: - raise ValueError("Cohere API key cannot be 'None'.") - self.client = cohere.Client(cohere_api_key) - - def __call__(self, query: str, docs: list[str]) -> list[str]: - # get top_n results - results = self.client.rerank( - query=query, documents=docs, top_n=self.top_n, model=self.name - ) - # get indices of entries that are ranked highest by cohere - top_idx = [r.index for r in results] - top_docs = [docs[i] for i in top_idx] - return top_idx, top_docs diff --git a/tests/unit/encoders/test_bm25.py b/tests/unit/encoders/test_bm25.py new file mode 100644 index 00000000..8c0e9bc4 --- /dev/null +++ b/tests/unit/encoders/test_bm25.py @@ -0,0 +1,21 @@ +import pytest + +from semantic_router.encoders import BM25Encoder + + +@pytest.fixture +def bm25_encoder(): + return BM25Encoder() + + +class TestBM25Encoder: + def test_initialization(self): + 
self.bm25_encoder = BM25Encoder() + assert len(self.bm25_encoder.idx_mapping) != 0 + + def test_call_method(self): + result = self.bm25_encoder(["test"]) + assert isinstance(result, list), "Result should be a list" + assert all( + isinstance(sublist, list) for sublist in result + ), "Each item in result should be a list" diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py index 0b9842fb..611aff45 100644 --- a/tests/unit/test_layer.py +++ b/tests/unit/test_layer.py @@ -1,7 +1,7 @@ import pytest from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder -from semantic_router.layer import DecisionLayer # Replace with the actual module name +from semantic_router.layer import DecisionLayer, HybridDecisionLayer # Replace with the actual module name from semantic_router.schema import Decision @@ -111,5 +111,73 @@ def test_failover_score_threshold(self, base_encoder): decision_layer = DecisionLayer(encoder=base_encoder) assert decision_layer.score_threshold == 0.82 +class TestHybridDecisionLayer: + def test_initialization(self, openai_encoder, decisions): + decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + assert decision_layer.score_threshold == 0.82 + assert len(decision_layer.index) == 5 + assert len(set(decision_layer.categories)) == 2 + + def test_initialization_different_encoders(self, cohere_encoder, openai_encoder): + decision_layer_cohere = HybridDecisionLayer(encoder=cohere_encoder) + assert decision_layer_cohere.score_threshold == 0.3 + + decision_layer_openai = HybridDecisionLayer(encoder=openai_encoder) + assert decision_layer_openai.score_threshold == 0.82 + + def test_add_decision(self, openai_encoder): + decision_layer = HybridDecisionLayer(encoder=openai_encoder) + decision = Decision(name="Decision 3", utterances=["Yes", "No"]) + decision_layer.add(decision) + assert len(decision_layer.index) == 2 + assert len(set(decision_layer.categories)) == 1 + + def test_add_multiple_decisions(self, 
openai_encoder, decisions): + decision_layer = HybridDecisionLayer(encoder=openai_encoder) + for decision in decisions: + decision_layer.add(decision) + assert len(decision_layer.index) == 5 + assert len(set(decision_layer.categories)) == 2 + + def test_query_and_classification(self, openai_encoder, decisions): + decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + query_result = decision_layer("Hello") + assert query_result in ["Decision 1", "Decision 2"] + + def test_query_with_no_index(self, openai_encoder): + decision_layer = HybridDecisionLayer(encoder=openai_encoder) + assert decision_layer("Anything") is None + + def test_semantic_classify(self, openai_encoder, decisions): + decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + classification, score = decision_layer._semantic_classify( + [ + {"decision": "Decision 1", "score": 0.9}, + {"decision": "Decision 2", "score": 0.1}, + ] + ) + assert classification == "Decision 1" + assert score == [0.9] + + def test_semantic_classify_multiple_decisions(self, openai_encoder, decisions): + decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + classification, score = decision_layer._semantic_classify( + [ + {"decision": "Decision 1", "score": 0.9}, + {"decision": "Decision 2", "score": 0.1}, + {"decision": "Decision 1", "score": 0.8}, + ] + ) + assert classification == "Decision 1" + assert score == [0.9, 0.8] + + def test_pass_threshold(self, openai_encoder): + decision_layer = HybridDecisionLayer(encoder=openai_encoder) + assert not decision_layer._pass_threshold([], 0.5) + assert decision_layer._pass_threshold([0.6, 0.7], 0.5) + + def test_failover_score_threshold(self, base_encoder): + decision_layer = HybridDecisionLayer(encoder=base_encoder) + assert decision_layer.score_threshold == 0.82 # Add more tests for edge cases and error handling as needed. 
From 839c259a1e13c35e4eadb1ec5042d8ceae04cfa3 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:07:51 -0800 Subject: [PATCH 11/16] test fix --- tests/unit/encoders/test_bm25.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/encoders/test_bm25.py b/tests/unit/encoders/test_bm25.py index 8c0e9bc4..086a80bd 100644 --- a/tests/unit/encoders/test_bm25.py +++ b/tests/unit/encoders/test_bm25.py @@ -10,11 +10,11 @@ def bm25_encoder(): class TestBM25Encoder: def test_initialization(self): - self.bm25_encoder = BM25Encoder() - assert len(self.bm25_encoder.idx_mapping) != 0 + bm25_encoder = BM25Encoder() + assert len(bm25_encoder.idx_mapping) != 0 def test_call_method(self): - result = self.bm25_encoder(["test"]) + result = bm25_encoder(["test"]) assert isinstance(result, list), "Result should be a list" assert all( isinstance(sublist, list) for sublist in result From 1eb511c0cc1b8f483fe4c7c5793369ef22d85e4e Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:09:15 -0800 Subject: [PATCH 12/16] lint --- tests/unit/test_layer.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py index 611aff45..a746c4ec 100644 --- a/tests/unit/test_layer.py +++ b/tests/unit/test_layer.py @@ -1,7 +1,10 @@ import pytest from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder -from semantic_router.layer import DecisionLayer, HybridDecisionLayer # Replace with the actual module name +from semantic_router.layer import ( + DecisionLayer, + HybridDecisionLayer, +) # Replace with the actual module name from semantic_router.schema import Decision @@ -111,9 +114,12 @@ def test_failover_score_threshold(self, base_encoder): decision_layer = DecisionLayer(encoder=base_encoder) assert decision_layer.score_threshold == 0.82 + class 
TestHybridDecisionLayer: def test_initialization(self, openai_encoder, decisions): - decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + decision_layer = HybridDecisionLayer( + encoder=openai_encoder, decisions=decisions + ) assert decision_layer.score_threshold == 0.82 assert len(decision_layer.index) == 5 assert len(set(decision_layer.categories)) == 2 @@ -140,7 +146,9 @@ def test_add_multiple_decisions(self, openai_encoder, decisions): assert len(set(decision_layer.categories)) == 2 def test_query_and_classification(self, openai_encoder, decisions): - decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + decision_layer = HybridDecisionLayer( + encoder=openai_encoder, decisions=decisions + ) query_result = decision_layer("Hello") assert query_result in ["Decision 1", "Decision 2"] @@ -149,7 +157,9 @@ def test_query_with_no_index(self, openai_encoder): assert decision_layer("Anything") is None def test_semantic_classify(self, openai_encoder, decisions): - decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + decision_layer = HybridDecisionLayer( + encoder=openai_encoder, decisions=decisions + ) classification, score = decision_layer._semantic_classify( [ {"decision": "Decision 1", "score": 0.9}, @@ -160,7 +170,9 @@ def test_semantic_classify(self, openai_encoder, decisions): assert score == [0.9] def test_semantic_classify_multiple_decisions(self, openai_encoder, decisions): - decision_layer = HybridDecisionLayer(encoder=openai_encoder, decisions=decisions) + decision_layer = HybridDecisionLayer( + encoder=openai_encoder, decisions=decisions + ) classification, score = decision_layer._semantic_classify( [ {"decision": "Decision 1", "score": 0.9}, @@ -180,4 +192,5 @@ def test_failover_score_threshold(self, base_encoder): decision_layer = HybridDecisionLayer(encoder=base_encoder) assert decision_layer.score_threshold == 0.82 + # Add more tests for edge cases and error 
handling as needed. From 55cd79da566b0d05c6c27b01cce02c7c047f3f91 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:22:00 -0800 Subject: [PATCH 13/16] fix bm25 test --- tests/unit/encoders/test_bm25.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/encoders/test_bm25.py b/tests/unit/encoders/test_bm25.py index 086a80bd..3a94602e 100644 --- a/tests/unit/encoders/test_bm25.py +++ b/tests/unit/encoders/test_bm25.py @@ -9,11 +9,11 @@ def bm25_encoder(): class TestBM25Encoder: - def test_initialization(self): + def test_initialization(self, bm25_encoder): bm25_encoder = BM25Encoder() assert len(bm25_encoder.idx_mapping) != 0 - def test_call_method(self): + def test_call_method(self, bm25_encoder): result = bm25_encoder(["test"]) assert isinstance(result, list), "Result should be a list" assert all( From 8d201d5163270b703a0fc7481c431d8c0bb12662 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:37:00 -0800 Subject: [PATCH 14/16] add coverage for bm25 fit --- tests/unit/encoders/test_bm25.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/encoders/test_bm25.py b/tests/unit/encoders/test_bm25.py index 3a94602e..9822d29b 100644 --- a/tests/unit/encoders/test_bm25.py +++ b/tests/unit/encoders/test_bm25.py @@ -10,7 +10,14 @@ def bm25_encoder(): class TestBM25Encoder: def test_initialization(self, bm25_encoder): - bm25_encoder = BM25Encoder() + assert len(bm25_encoder.idx_mapping) != 0 + + def test_fit(self, bm25_encoder): + bm25_encoder.fit([ + "some docs", + "and more docs", + "and even more docs" + ]) assert len(bm25_encoder.idx_mapping) != 0 def test_call_method(self, bm25_encoder): From fde33553beba972becf46ad473895e1949921c55 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:39:28 -0800 Subject: [PATCH 
15/16] lint --- tests/unit/encoders/test_bm25.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/unit/encoders/test_bm25.py b/tests/unit/encoders/test_bm25.py index 9822d29b..e0f6dad5 100644 --- a/tests/unit/encoders/test_bm25.py +++ b/tests/unit/encoders/test_bm25.py @@ -13,11 +13,7 @@ def test_initialization(self, bm25_encoder): assert len(bm25_encoder.idx_mapping) != 0 def test_fit(self, bm25_encoder): - bm25_encoder.fit([ - "some docs", - "and more docs", - "and even more docs" - ]) + bm25_encoder.fit(["some docs", "and more docs", "and even more docs"]) assert len(bm25_encoder.idx_mapping) != 0 def test_call_method(self, bm25_encoder): From bc976017307136f3daf98424d399761af29e8091 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:48:18 -0800 Subject: [PATCH 16/16] add more coverage to bm25 --- tests/unit/encoders/test_bm25.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/unit/encoders/test_bm25.py b/tests/unit/encoders/test_bm25.py index e0f6dad5..c1987151 100644 --- a/tests/unit/encoders/test_bm25.py +++ b/tests/unit/encoders/test_bm25.py @@ -22,3 +22,14 @@ def test_call_method(self, bm25_encoder): assert all( isinstance(sublist, list) for sublist in result ), "Each item in result should be a list" + + def test_call_method_no_docs(self, bm25_encoder): + with pytest.raises(ValueError): + bm25_encoder([]) + + def test_call_method_no_word(self, bm25_encoder): + result = bm25_encoder(["doc with fake word gta5jabcxyz"]) + assert isinstance(result, list), "Result should be a list" + assert all( + isinstance(sublist, list) for sublist in result + ), "Each item in result should be a list"