From 529dbdcd88ece7a716609cd953989e0607a688d7 Mon Sep 17 00:00:00 2001
From: Woyten Tielesch
Date: Thu, 14 Nov 2024 17:17:19 +0100
Subject: [PATCH] Support instructable_embed endpoint

---
 Changelog.md                             |   4 +
 README.md                                |   1 +
 aleph_alpha_client/aleph_alpha_client.py | 115 ++++++++++++++++++++++-
 aleph_alpha_client/chat.py               |  23 +++--
 aleph_alpha_client/embedding.py          | 105 ++++++++++++++++-----
 aleph_alpha_client/version.py            |   8 +-
 tests/test_chat.py                       |  52 +++++-----
 tests/test_embed.py                      |  38 +++++++-
 8 files changed, 289 insertions(+), 57 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index 8931cce..dd1d9d6 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 7.6.0
+
+- Add `instructable_embed` to `Client` and `AsyncClient`
+
 ## 7.5.1
 
 - Add fallback mechanism for figuring out the version locally.
diff --git a/README.md b/README.md
index a76f4dd..363f7ef 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ async with AsyncClient(token=os.environ["AA_TOKEN"]) as client:
         maximum_tokens=64,
     )
     response = client.complete_with_streaming(request, model="luminous-base")
+
     async for stream_item in response:
         print(stream_item)
 ```
diff --git a/aleph_alpha_client/aleph_alpha_client.py b/aleph_alpha_client/aleph_alpha_client.py
index cd7900a..985a07c 100644
--- a/aleph_alpha_client/aleph_alpha_client.py
+++ b/aleph_alpha_client/aleph_alpha_client.py
@@ -37,7 +37,13 @@
     CompletionResponseStreamItem,
     stream_item_from_json,
 )
-from aleph_alpha_client.chat import ChatRequest, ChatResponse, ChatStreamChunk, ChatStreamChunk, Usage, stream_chat_item_from_json
+from aleph_alpha_client.chat import (
+    ChatRequest,
+    ChatResponse,
+    ChatStreamChunk,
+    Usage,
+    stream_chat_item_from_json,
+)
 from aleph_alpha_client.evaluation import EvaluationRequest, EvaluationResponse
 from aleph_alpha_client.tokenization import TokenizationRequest, TokenizationResponse
 from aleph_alpha_client.detokenization import (
@@ -50,6 +56,8 @@
     EmbeddingRequest,
     EmbeddingResponse,
     EmbeddingVector,
+    InstructableEmbeddingRequest,
+    InstructableEmbeddingResponse,
     SemanticEmbeddingRequest,
     SemanticEmbeddingResponse,
 )
@@ -104,6 +112,7 @@ def _check_api_version(version_str: str):
     TokenizationRequest,
     DetokenizationRequest,
     SemanticEmbeddingRequest,
+    InstructableEmbeddingRequest,
     BatchSemanticEmbeddingRequest,
     QaRequest,
     SummarizationRequest,
@@ -514,6 +523,58 @@ def batch_semantic_embed(
             num_tokens_prompt_total=num_tokens_prompt_total,
         )
 
+    def instructable_embed(
+        self,
+        request: InstructableEmbeddingRequest,
+        model: str,
+    ) -> InstructableEmbeddingResponse:
+        """Embeds a text and returns vectors that can be used for classification according to a given instruction.
+
+        Parameters:
+            request (InstructableEmbeddingRequest, required):
+                Parameters for the requested instructable embedding.
+
+            model (string, required):
+                Name of model to use. A model name refers to a model architecture (number of parameters among others).
+                Always the latest version of the model is used.
+
+        Examples:
+            >>> # function for salutation embedding
+            >>> def embed_salutation(text: str):
+                    # Create an embedding request with a given instruction
+                    request = InstructableEmbeddingRequest(
+                        input=Prompt.from_text(text),
+                        instruction="Represent the text to query a database of salutations"
+                    )
+                    # create the embedding
+                    result = client.instructable_embed(request, model=model_name)
+                    return result.embedding
+            >>>
+            >>> # function to calculate similarity
+            >>> def cosine_similarity(v1: Sequence[float], v2: Sequence[float]) -> float:
+                    "compute cosine similarity of v1 to v2: (v1 dot v2)/(||v1||*||v2||)"
+                    sumxx, sumxy, sumyy = 0, 0, 0
+                    for i in range(len(v1)):
+                        x = v1[i]; y = v2[i]
+                        sumxx += x*x
+                        sumyy += y*y
+                        sumxy += x*y
+                    return sumxy/math.sqrt(sumxx*sumyy)
+            >>>
+            >>> # define the texts
+            >>> text_a = "Hello"
+            >>> text_b = "Good morning"
+            >>>
+            >>> # show the similarity
+            >>> print(cosine_similarity(embed_salutation(text_a), embed_salutation(text_b)))
+        """
+        response = self._post_request(
+            "instructable_embed",
+            request,
+            model,
+        )
+        return InstructableEmbeddingResponse.from_json(response)
+
     def evaluate(
         self,
         request: EvaluationRequest,
@@ -1206,6 +1267,58 @@ async def batch_semantic_embed(
             num_tokens_prompt_total=num_tokens_prompt_total,
         )
 
+    async def instructable_embed(
+        self,
+        request: InstructableEmbeddingRequest,
+        model: str,
+    ) -> InstructableEmbeddingResponse:
+        """Embeds a text and returns vectors that can be used for classification according to a given instruction.
+
+        Parameters:
+            request (InstructableEmbeddingRequest, required):
+                Parameters for the requested instructable embedding.
+
+            model (string, required):
+                Name of model to use. A model name refers to a model architecture (number of parameters among others).
+                Always the latest version of the model is used.
+
+        Examples:
+            >>> # function for salutation embedding
+            >>> async def embed_salutation(text: str):
+                    # Create an embedding request with a given instruction
+                    request = InstructableEmbeddingRequest(
+                        input=Prompt.from_text(text),
+                        instruction="Represent the text to query a database of salutations"
+                    )
+                    # create the embedding
+                    result = await client.instructable_embed(request, model=model_name)
+                    return result.embedding
+            >>>
+            >>> # function to calculate similarity
+            >>> def cosine_similarity(v1: Sequence[float], v2: Sequence[float]) -> float:
+                    "compute cosine similarity of v1 to v2: (v1 dot v2)/(||v1||*||v2||)"
+                    sumxx, sumxy, sumyy = 0, 0, 0
+                    for i in range(len(v1)):
+                        x = v1[i]; y = v2[i]
+                        sumxx += x*x
+                        sumyy += y*y
+                        sumxy += x*y
+                    return sumxy/math.sqrt(sumxx*sumyy)
+            >>>
+            >>> # define the texts
+            >>> text_a = "Hello"
+            >>> text_b = "Good morning"
+            >>>
+            >>> # show the similarity
+            >>> print(cosine_similarity(await embed_salutation(text_a), await embed_salutation(text_b)))
+        """
+        response = await self._post_request(
+            "instructable_embed",
+            request,
+            model,
+        )
+        return InstructableEmbeddingResponse.from_json(response)
+
     async def evaluate(
         self,
         request: EvaluationRequest,
diff --git a/aleph_alpha_client/chat.py b/aleph_alpha_client/chat.py
index 1e22a12..33b279f 100644
--- a/aleph_alpha_client/chat.py
+++ b/aleph_alpha_client/chat.py
@@ -5,6 +5,7 @@
 
 class Role(str, Enum):
     """A role used for a message in a chat."""
+
     User = "user"
     Assistant = "assistant"
     System = "system"
@@ -14,7 +15,7 @@ class Role(str, Enum):
 class Message:
     """
     Describes a message in a chat.
-    
+
     Parameters:
         role (Role, required):
             The role of the message.
@@ -22,6 +23,7 @@ class Message:
         content (str, required):
             The content of the message.
     """
+
     role: Role
     content: str
 
@@ -41,6 +43,7 @@ class StreamOptions:
     """
     Additional options to affect the streaming behavior.
     """
+
     # If set, an additional chunk will be streamed before the data: [DONE] message.
     # The usage field on this chunk shows the token usage statistics for the entire
     # request, and the choices field will always be an empty array.
@@ -51,10 +54,11 @@ class StreamOptions:
 class ChatRequest:
     """
     Describes a chat request.
-    
+
     Only supports a subset of the parameters of `CompletionRequest` for simplicity.
     See `CompletionRequest` for documentation on the parameters.
     """
+
     model: str
     messages: List[Message]
     maximum_tokens: Optional[int] = None
@@ -77,6 +81,7 @@ class ChatResponse:
     As the `ChatRequest` does not support the `n` parameter (allowing for multiple return values),
     the `ChatResponse` assumes there to be only one choice.
     """
+
     finish_reason: str
     message: Message
 
@@ -89,7 +94,6 @@ def from_json(json: Dict[str, Any]) -> "ChatResponse":
         )
 
 
-
 @dataclass(frozen=True)
 class Usage:
     """
@@ -98,6 +102,7 @@ class Usage:
     When streaming is enabled, this field will be null by default.
     To include an additional usage-only message in the response stream, set stream_options.include_usage to true.
     """
+
     # Number of tokens in the generated completion.
     completion_tokens: int
 
@@ -112,11 +117,10 @@ def from_json(json: Dict[str, Any]) -> "Usage":
         return Usage(
             completion_tokens=json["completion_tokens"],
             prompt_tokens=json["prompt_tokens"],
-            total_tokens=json["total_tokens"]
+            total_tokens=json["total_tokens"],
         )
 
 
-
 @dataclass(frozen=True)
 class ChatStreamChunk:
     """
@@ -128,7 +132,8 @@ class ChatStreamChunk:
 
         role (Role, optional): The role of the current chat completion. Will be assistant for the first chunk of every completion stream
             and missing for the remaining chunks.
-    """
+    """
+
     content: str
     role: Optional[Role]
 
@@ -146,8 +151,10 @@ def from_json(json: Dict[str, Any]) -> Optional["ChatStreamChunk"]:
         )
 
 
-def stream_chat_item_from_json(json: Dict[str, Any]) -> Union[Usage, ChatStreamChunk, None]:
+def stream_chat_item_from_json(
+    json: Dict[str, Any],
+) -> Union[Usage, ChatStreamChunk, None]:
     if (usage := json.get("usage")) is not None:
         return Usage.from_json(usage)
 
-    return ChatStreamChunk.from_json(json)
\ No newline at end of file
+    return ChatStreamChunk.from_json(json)
diff --git a/aleph_alpha_client/embedding.py b/aleph_alpha_client/embedding.py
index d565bd7..f4e28d4 100644
--- a/aleph_alpha_client/embedding.py
+++ b/aleph_alpha_client/embedding.py
@@ -20,14 +20,12 @@ class EmbeddingRequest:
     Parameters:
         prompt
             The text and/or image(s) to be embedded.
-
         layers
             A list of layer indices from which to return embeddings.
 
             * Index 0 corresponds to the word embeddings used as input to the first transformer layer
             * Index 1 corresponds to the hidden state as output by the first transformer layer, index 2 to the output of the second layer etc.
             * Index -1 corresponds to the last transformer layer (not the language modelling head), index -2 to the second last layer etc.
-
         pooling
             Pooling operation to use.
             Pooling operations include:
@@ -36,19 +34,12 @@ class EmbeddingRequest:
             * max: aggregate token embeddings across the sequence dimension using a maximum
             * last_token: just use the last token
            * abs_max: aggregate token embeddings across the sequence dimension using a maximum of absolute values
-
        type
            Type of the embedding (e.g. symmetric or asymmetric)
-
        tokens
            Flag indicating whether the tokenized prompt is to be returned (True) or not (False)
-
        normalize
            Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
-
-            Note that at the moment this parameter does not yet have any effect. This will change as soon as the
-            corresponding feature is available in the backend
-
        contextual_control_threshold (float, default None)
            If set to None, attention control parameters only apply to those tokens that have
            explicitly been set in the request.
            If set to a non-None value, we apply the control parameters to similar tokens as well.
            Controls that have been applied to one token will then be applied to all other tokens
            that have at least the similarity score defined by this parameter.
            The similarity score is the cosine similarity of token embeddings.
-
        control_log_additive (bool, default True)
            True: apply control by adding the log(control_factor) to attention scores.
            False: apply control by (attention_scores - - attention_scores.min(-1)) * control_factor
@@ -150,13 +140,8 @@ class SemanticEmbeddingRequest:
            The 128 size is expected to have a small drop in accuracy performance (4-6%),
            with the benefit of being much smaller, which makes comparing these embeddings much faster for use cases where speed is critical.
            The 128 size can also perform better if you are embedding really short texts or documents.
-
        normalize
            Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
-
-            Note that at the moment this parameter does not yet have any effect. This will change as soon as the
-            corresponding feature is available in the backend
-
        contextual_control_threshold (float, default None)
            If set to None, attention control parameters only apply to those tokens that have
            explicitly been set in the request.
            If set to a non-None value, we apply the control parameters to similar tokens as well.
            Controls that have been applied to one token will then be applied to all other tokens
            that have at least the similarity score defined by this parameter.
            The similarity score is the cosine similarity of token embeddings.
-
        control_log_additive (bool, default True)
            True: apply control by adding the log(control_factor) to attention scores.
            False: apply control by (attention_scores - - attention_scores.min(-1)) * control_factor
@@ -223,13 +207,8 @@ class BatchSemanticEmbeddingRequest:
            The 128 size is expected to have a small drop in accuracy performance (4-6%),
            with the benefit of being much smaller, which makes comparing these embeddings much faster for use cases where speed is critical.
            The 128 size can also perform better if you are embedding really short texts or documents.
-
        normalize
            Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
-
-            Note that at the moment this parameter does not yet have any effect. This will change as soon as the
-            corresponding feature is available in the backend
-
        contextual_control_threshold (float, default None)
            If set to None, attention control parameters only apply to those tokens that have
            explicitly been set in the request.
            If set to a non-None value, we apply the control parameters to similar tokens as well.
            Controls that have been applied to one token will then be applied to all other tokens
            that have at least the similarity score defined by this parameter.
            The similarity score is the cosine similarity of token embeddings.
-
        control_log_additive (bool, default True)
            True: apply control by adding the log(control_factor) to attention scores.
            False: apply control by (attention_scores - - attention_scores.min(-1)) * control_factor
@@ -272,6 +250,63 @@ def _asdict(self) -> Mapping[str, Any]:
         return asdict(self)
 
 
+@dataclass(frozen=True)
+class InstructableEmbeddingRequest:
+    """
+    Embeds a text and returns vectors that can be used for classification according to a given instruction.
+
+    Parameters:
+        input
+            The text and/or image(s) to be embedded.
+        instruction
+            An instruction specifying the aspect to attend to when generating the embedding.
+        normalize
+            Return normalized embeddings. This can be used to save on additional compute when applying a cosine similarity metric.
+        contextual_control_threshold (float, default None)
+            If set to None, attention control parameters only apply to those tokens that have
+            explicitly been set in the request.
+            If set to a non-None value, we apply the control parameters to similar tokens as well.
+            Controls that have been applied to one token will then be applied to all other tokens
+            that have at least the similarity score defined by this parameter.
+            The similarity score is the cosine similarity of token embeddings.
+        control_log_additive (bool, default True)
+            True: apply control by adding the log(control_factor) to attention scores.
+            False: apply control by (attention_scores - attention_scores.min(-1)) * control_factor
+
+    Examples:
+        >>> texts = [
+                "deep learning",
+                "artificial intelligence",
+                "deep diving",
+                "artificial snow",
+            ]
+        >>> # Texts to compare
+        >>> embeddings = []
+        >>> for text in texts:
+                request = InstructableEmbeddingRequest(
+                    input=Prompt.from_text(text),
+                    instruction="Represent the text to query a database of technical concepts",
+                )
+                result = model.instructable_embed(request)
+                embeddings.append(result.embedding)
+    """
+
+    input: Prompt
+    instruction: str
+    normalize: bool = False
+    contextual_control_threshold: Optional[float] = None
+    control_log_additive: Optional[bool] = True
+
+    def to_json(self) -> Mapping[str, Any]:
+        return {
+            **self._asdict(),
+            "input": self.input.to_json(),
+        }
+
+    def _asdict(self) -> Mapping[str, Any]:
+        return asdict(self)
+
+
 EmbeddingVector = List[float]
 
 
@@ -345,3 +380,29 @@ def _from_model_version_and_embeddings(
         embeddings=embeddings,
         num_tokens_prompt_total=num_tokens_prompt_total,
     )
+
+
+@dataclass(frozen=True)
+class InstructableEmbeddingResponse:
+    """
+    Response of an instructable embedding request
+
+    Parameters:
+        model_version
+            Model name and version (if any) of the used model for inference
+        embedding
+            A list of floats that can be used to compare against other embeddings.
+    """
+
+    model_version: str
+    embedding: EmbeddingVector
+    num_tokens_prompt_total: int
+    message: Optional[str] = None
+
+    @staticmethod
+    def from_json(json: Dict[str, Any]) -> "InstructableEmbeddingResponse":
+        return InstructableEmbeddingResponse(
+            model_version=json["model_version"],
+            embedding=json["embedding"],
+            num_tokens_prompt_total=json["num_tokens_prompt_total"],
+        )
diff --git a/aleph_alpha_client/version.py b/aleph_alpha_client/version.py
index 452d58b..d2d20ab 100644
--- a/aleph_alpha_client/version.py
+++ b/aleph_alpha_client/version.py
@@ -27,7 +27,7 @@ def pyproject_version() -> str:
     """
     NO_VERSION = "0.0.0"
     pyproject_path = Path(__file__).resolve().parent.parent / "pyproject.toml"
-    
+
     if not pyproject_path.is_file():
         logging.error("pyproject.toml file not found.")
         return NO_VERSION
@@ -36,8 +36,8 @@ def pyproject_version() -> str:
 
     with pyproject_path.open("r", encoding="utf-8") as file:
         content = file.read()
-    
-    if (match := version_pattern.search(content)):
+
+    if match := version_pattern.search(content):
         return match.group(1)
 
     logging.error("Version not found in pyproject.toml")
@@ -63,4 +63,4 @@ def user_agent_headers() -> Dict[str, str]:
     if __version__ == "0.0.0":
         return {}
     else:
-        return {"User-Agent": "Aleph-Alpha-Python-Client-" + __version__}
\ No newline at end of file
+        return {"User-Agent": "Aleph-Alpha-Python-Client-" + __version__}
diff --git a/tests/test_chat.py b/tests/test_chat.py
index 13c7598..eb006be 100644
--- a/tests/test_chat.py
+++ b/tests/test_chat.py
@@ -1,7 +1,15 @@
 import pytest
 
 from aleph_alpha_client import AsyncClient, Client
-from aleph_alpha_client.chat import ChatRequest, Message, Role, StreamOptions, stream_chat_item_from_json, Usage, ChatStreamChunk
+from aleph_alpha_client.chat import (
+    ChatRequest,
+    Message,
+    Role,
+    StreamOptions,
+    stream_chat_item_from_json,
+    Usage,
+    ChatStreamChunk,
+)
 from tests.common import async_client, sync_client, model_name, chat_model_name
 
 
@@ -27,7 +35,9 @@ def test_can_chat_with_chat_model(sync_client: Client, chat_model_name: str):
     assert response.message.content is not None
 
 
-async def test_can_chat_with_async_client(async_client: AsyncClient, chat_model_name: str):
+async def test_can_chat_with_async_client(
+    async_client: AsyncClient, chat_model_name: str
+):
     system_msg = Message(role=Role.System, content="You are a helpful assistant.")
     user_msg = Message(role=Role.User, content="Hello, how are you?")
     request = ChatRequest(
@@ -40,19 +50,27 @@ async def test_can_chat_with_async_client(async_client: AsyncClient, chat_model_
     assert response.message.content is not None
 
 
-async def test_can_chat_with_streaming_support(async_client: AsyncClient, chat_model_name: str):
+async def test_can_chat_with_streaming_support(
+    async_client: AsyncClient, chat_model_name: str
+):
     request = ChatRequest(
         messages=[Message(role=Role.User, content="Hello, how are you?")],
         model=chat_model_name,
     )
 
     stream_items = [
-        stream_item async for stream_item in async_client.chat_with_streaming(request, model=chat_model_name)
+        stream_item
+        async for stream_item in async_client.chat_with_streaming(
+            request, model=chat_model_name
+        )
     ]
 
     first = stream_items[0]
     assert isinstance(first, ChatStreamChunk) and first.role is not None
-    assert all(isinstance(item, ChatStreamChunk) and item.content is not None for item in stream_items[1:])
+    assert all(
+        isinstance(item, ChatStreamChunk) and item.content is not None
+        for item in stream_items[1:]
+    )
 
 
 async def test_usage_response_is_parsed():
@@ -63,11 +81,7 @@ async def test_usage_response_is_parsed():
         "model": "llama-3.1-70b-instruct",
         "system_fingerprint": ".unknown.",
         "object": "chat.completion.chunk",
-        "usage": {
-            "prompt_tokens": 31,
-            "completion_tokens": 88,
-            "total_tokens": 119
-        }
+        "usage": {"prompt_tokens": 31, "completion_tokens": 88, "total_tokens": 119},
     }
 
     # When parsing it
@@ -85,10 +99,8 @@ def test_chunk_response_is_parsed():
         {
             "finish_reason": None,
             "index": 0,
-            "delta": {
-                "content": " way, those clothes you're wearing"
-            },
-            "logprobs": None
+            "delta": {"content": " way, those clothes you're wearing"},
+            "logprobs": None,
         }
     ],
     "created": 1730133401,
@@ -106,25 +118,23 @@ def test_chunk_response_is_parsed():
     assert result.content == " way, those clothes you're wearing"
 
 
-
 async def test_stream_options(async_client: AsyncClient, chat_model_name: str):
     # Given a request with include usage options set
     stream_options = StreamOptions(include_usage=True)
     request = ChatRequest(
         messages=[Message(role=Role.User, content="Hello, how are you?")],
         model=chat_model_name,
-        stream_options=stream_options
-
+        stream_options=stream_options,
     )
 
     # When receiving the chunks
     stream_items = [
-        stream_item async for stream_item in async_client.chat_with_streaming(request, model=chat_model_name)
+        stream_item
+        async for stream_item in async_client.chat_with_streaming(
+            request, model=chat_model_name
+        )
     ]
 
     # Then the last chunks has information about usage
     assert all(isinstance(item, ChatStreamChunk) for item in stream_items[:-1])
     assert isinstance(stream_items[-1], Usage)
-
-
-
\ No newline at end of file
diff --git a/tests/test_embed.py b/tests/test_embed.py
index bcad027..8834eda 100644
--- a/tests/test_embed.py
+++ b/tests/test_embed.py
@@ -9,6 +9,7 @@
 from aleph_alpha_client.aleph_alpha_client import AsyncClient, Client
 from aleph_alpha_client.embedding import (
     BatchSemanticEmbeddingRequest,
+    InstructableEmbeddingRequest,
     SemanticEmbeddingRequest,
     SemanticRepresentation,
     BatchSemanticEmbeddingResponse,
@@ -93,7 +94,7 @@ async def test_batch_embed_semantic_with_async_client(
 
 @pytest.mark.parametrize("batch_size", [-1, 0, 101])
 async def test_batch_embed_semantic_invalid_batch_sizes(
-    async_client: AsyncClient, sync_client: Client, batch_size: int
+    async_client: AsyncClient, batch_size: int
 ):
     words = ["car", "elephant", "kitchen sink", "rubber", "sun"]
     request = BatchSemanticEmbeddingRequest(
@@ -105,6 +106,24 @@ async def test_batch_embed_semantic_invalid_batch_sizes(
         await async_client.batch_semantic_embed(request=request, batch_size=batch_size)
 
 
+@pytest.mark.system_test
+async def test_can_instructable_embed_with_async_client(
+    async_client: AsyncClient,
+):
+    request = InstructableEmbeddingRequest(
+        input=Prompt.from_text("hello"),
+        instruction="Represent the text to query a database of salutations",
+    )
+
+    response = await async_client.instructable_embed(
+        request, model="Pharia-1-Embedding-4608-control"
+    )
+    assert response.model_version is not None
+    assert response.embedding
+    assert len(response.embedding) == 4608
+    assert response.num_tokens_prompt_total >= 1
+
+
 def cosine_similarity(emb1: Sequence[float], emb2: Sequence[float]) -> float:
     "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
     sumxx, sumxy, sumyy = 0.0, 0.0, 0.0
@@ -214,6 +233,23 @@ def test_embed_semantic(sync_client: Client):
     assert result.num_tokens_prompt_total >= 1
 
 
+@pytest.mark.system_test
+def test_embed_instructable(sync_client: Client):
+    request = InstructableEmbeddingRequest(
+        input=Prompt.from_text("hello"),
+        instruction="Represent the text to query a database of salutations",
+    )
+
+    result = sync_client.instructable_embed(
+        request=request, model="Pharia-1-Embedding-4608-control"
+    )
+
+    assert result.model_version is not None
+    assert result.embedding
+    assert len(result.embedding) == 4608
+    assert result.num_tokens_prompt_total >= 1
+
+
 @pytest.mark.parametrize("num_prompts", [1, 100, 101, 200])
 @pytest.mark.system_test
 def test_batch_embed_semantic(sync_client: Client, num_prompts: int):
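
For quick reference, a minimal usage sketch of the endpoint added by this patch, mirroring the docstring examples and the system tests above. It is not part of the diff itself; it assumes an `AA_TOKEN` environment variable holding a valid API token and reuses the `Pharia-1-Embedding-4608-control` model name from the tests:

```python
import os

from aleph_alpha_client import Client, Prompt
from aleph_alpha_client.embedding import InstructableEmbeddingRequest

# Synchronous client; AA_TOKEN must contain a valid Aleph Alpha API token.
client = Client(token=os.environ["AA_TOKEN"])

# Steer the embedding with an instruction describing how the text will be queried.
request = InstructableEmbeddingRequest(
    input=Prompt.from_text("Hello"),
    instruction="Represent the text to query a database of salutations",
)
response = client.instructable_embed(request, model="Pharia-1-Embedding-4608-control")

# For this model the tests expect a single 4608-dimensional vector.
print(len(response.embedding), response.num_tokens_prompt_total)
```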