From 642427ec514387452df5030598b83ab686211442 Mon Sep 17 00:00:00 2001 From: Steve Liu Date: Thu, 21 Nov 2024 17:25:54 -0800 Subject: [PATCH] feat: Add Contract Tests for new Gen AI attributes for foundational models (#292) Add contract tests for the new gen_ai inference parameters introduced in PR #290 (https://github.com/aws-observability/aws-otel-python-instrumentation/pull/290). [image] By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --------- Co-authored-by: Michael He <53622546+yiyuan-he@users.noreply.github.com> --- .../applications/botocore/botocore_server.py | 164 +++++++++++++-- .../test/amazon/base/contract_test_base.py | 8 +- .../test/amazon/botocore/botocore_test.py | 186 ++++++++++++++++-- 3 files changed, 327 insertions(+), 31 deletions(-) diff --git a/contract-tests/images/applications/botocore/botocore_server.py b/contract-tests/images/applications/botocore/botocore_server.py index f16948390..d1736d56c 100644 --- a/contract-tests/images/applications/botocore/botocore_server.py +++ b/contract-tests/images/applications/botocore/botocore_server.py @@ -6,6 +6,7 @@ import tempfile from collections import namedtuple from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from io import BytesIO from threading import Thread import boto3 @@ -13,6 +14,7 @@ from botocore.client import BaseClient from botocore.config import Config from botocore.exceptions import ClientError +from botocore.response import StreamingBody from typing_extensions import Tuple, override _PORT: int = 8080 @@ -285,28 +287,22 @@ def _handle_bedrock_request(self) -> None: }, ) elif self.in_path("invokemodel/invoke-model"): + model_id, request_body, response_body = get_model_request_response(self.path) + set_main_status(200) bedrock_runtime_client.meta.events.register( "before-call.bedrock-runtime.InvokeModel", - inject_200_success, - ) - model_id = "amazon.titan-text-premier-v1:0" - user_message = "Describe 
the purpose of a 'hello world' program in one line." - prompt = f"[INST] {user_message} [/INST]" - body = json.dumps( - { - "inputText": prompt, - "textGenerationConfig": { - "maxTokenCount": 3072, - "stopSequences": [], - "temperature": 0.7, - "topP": 0.9, - }, - } + lambda **kwargs: inject_200_success( + modelId=model_id, + body=response_body, + **kwargs, + ), ) accept = "application/json" content_type = "application/json" - bedrock_runtime_client.invoke_model(body=body, modelId=model_id, accept=accept, contentType=content_type) + bedrock_runtime_client.invoke_model( + body=request_body, modelId=model_id, accept=accept, contentType=content_type + ) else: set_main_status(404) @@ -378,6 +374,137 @@ def _end_request(self, status_code: int): self.end_headers() +def get_model_request_response(path): + prompt = "Describe the purpose of a 'hello world' program in one line." + model_id = "" + request_body = {} + response_body = {} + + if "amazon.titan" in path: + model_id = "amazon.titan-text-premier-v1:0" + + request_body = { + "inputText": prompt, + "textGenerationConfig": { + "maxTokenCount": 3072, + "stopSequences": [], + "temperature": 0.7, + "topP": 0.9, + }, + } + + response_body = { + "inputTextTokenCount": 15, + "results": [ + { + "tokenCount": 13, + "outputText": "text-test-response", + "completionReason": "CONTENT_FILTERED", + }, + ], + } + + if "anthropic.claude" in path: + model_id = "anthropic.claude-v2:1" + + request_body = { + "anthropic_version": "bedrock-2023-05-31", + "max_tokens": 1000, + "temperature": 0.99, + "top_p": 1, + "messages": [ + { + "role": "user", + "content": [{"type": "text", "text": prompt}], + }, + ], + } + + response_body = { + "stop_reason": "end_turn", + "usage": { + "input_tokens": 15, + "output_tokens": 13, + }, + } + + if "meta.llama" in path: + model_id = "meta.llama2-13b-chat-v1" + + request_body = {"prompt": prompt, "max_gen_len": 512, "temperature": 0.5, "top_p": 0.9} + + response_body = {"prompt_token_count": 31, 
"generation_token_count": 49, "stop_reason": "stop"} + + if "cohere.command" in path: + model_id = "cohere.command-r-v1:0" + + request_body = { + "chat_history": [], + "message": prompt, + "max_tokens": 512, + "temperature": 0.5, + "p": 0.65, + } + + response_body = { + "chat_history": [ + {"role": "USER", "message": prompt}, + {"role": "CHATBOT", "message": "test-text-output"}, + ], + "finish_reason": "COMPLETE", + "text": "test-generation-text", + } + + if "ai21.jamba" in path: + model_id = "ai21.jamba-1-5-large-v1:0" + + request_body = { + "messages": [ + { + "role": "user", + "content": prompt, + }, + ], + "top_p": 0.8, + "temperature": 0.6, + "max_tokens": 512, + } + + response_body = { + "stop_reason": "end_turn", + "usage": { + "prompt_tokens": 21, + "completion_tokens": 24, + }, + "choices": [ + {"finish_reason": "stop"}, + ], + } + + if "mistral" in path: + model_id = "mistral.mistral-7b-instruct-v0:2" + + request_body = { + "prompt": prompt, + "max_tokens": 4096, + "temperature": 0.75, + "top_p": 0.99, + } + + response_body = { + "outputs": [ + { + "text": "test-output-text", + "stop_reason": "stop", + }, + ] + } + + json_bytes = json.dumps(response_body).encode("utf-8") + + return model_id, json.dumps(request_body), StreamingBody(BytesIO(json_bytes), len(json_bytes)) + + def set_main_status(status: int) -> None: RequestHandler.main_status = status @@ -490,11 +617,16 @@ def inject_200_success(**kwargs): guardrail_arn = kwargs.get("guardrailArn") if guardrail_arn is not None: response_body["guardrailArn"] = guardrail_arn + model_id = kwargs.get("modelId") + if model_id is not None: + response_body["modelId"] = model_id HTTPResponse = namedtuple("HTTPResponse", ["status_code", "headers", "body"]) headers = kwargs.get("headers", {}) body = kwargs.get("body", "") + response_body["body"] = body http_response = HTTPResponse(200, headers=headers, body=body) + return http_response, response_body diff --git 
a/contract-tests/tests/test/amazon/base/contract_test_base.py b/contract-tests/tests/test/amazon/base/contract_test_base.py index ba96530b0..64569450b 100644 --- a/contract-tests/tests/test/amazon/base/contract_test_base.py +++ b/contract-tests/tests/test/amazon/base/contract_test_base.py @@ -173,6 +173,12 @@ def _assert_int_attribute(self, attributes_dict: Dict[str, AnyValue], key: str, self.assertIsNotNone(actual_value) self.assertEqual(expected_value, actual_value.int_value) + def _assert_float_attribute(self, attributes_dict: Dict[str, AnyValue], key: str, expected_value: float) -> None: + self.assertIn(key, attributes_dict) + actual_value: AnyValue = attributes_dict[key] + self.assertIsNotNone(actual_value) + self.assertEqual(expected_value, actual_value.double_value) + def _assert_match_attribute(self, attributes_dict: Dict[str, AnyValue], key: str, pattern: str) -> None: self.assertIn(key, attributes_dict) actual_value: AnyValue = attributes_dict[key] @@ -237,5 +243,5 @@ def _is_valid_regex(self, pattern: str) -> bool: try: re.compile(pattern) return True - except re.error: + except (re.error, StopIteration, RuntimeError, KeyError): return False diff --git a/contract-tests/tests/test/amazon/botocore/botocore_test.py b/contract-tests/tests/test/amazon/botocore/botocore_test.py index f5ae91a59..b2821a8b6 100644 --- a/contract-tests/tests/test/amazon/botocore/botocore_test.py +++ b/contract-tests/tests/test/amazon/botocore/botocore_test.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 +import math from logging import INFO, Logger, getLogger from typing import Dict, List @@ -34,13 +35,21 @@ _AWS_BEDROCK_GUARDRAIL_ID: str = "aws.bedrock.guardrail.id" _AWS_BEDROCK_KNOWLEDGE_BASE_ID: str = "aws.bedrock.knowledge_base.id" _AWS_BEDROCK_DATA_SOURCE_ID: str = "aws.bedrock.data_source.id" + _GEN_AI_REQUEST_MODEL: str = "gen_ai.request.model" +_GEN_AI_REQUEST_TEMPERATURE: str = "gen_ai.request.temperature" +_GEN_AI_REQUEST_TOP_P: str = "gen_ai.request.top_p" +_GEN_AI_REQUEST_MAX_TOKENS: str = "gen_ai.request.max_tokens" +_GEN_AI_RESPONSE_FINISH_REASONS: str = "gen_ai.response.finish_reasons" +_GEN_AI_USAGE_INPUT_TOKENS: str = "gen_ai.usage.input_tokens" +_GEN_AI_USAGE_OUTPUT_TOKENS: str = "gen_ai.usage.output_tokens" + _AWS_SECRET_ARN: str = "aws.secretsmanager.secret.arn" _AWS_STATE_MACHINE_ARN: str = "aws.stepfunctions.state_machine.arn" _AWS_ACTIVITY_ARN: str = "aws.stepfunctions.activity.arn" -# pylint: disable=too-many-public-methods +# pylint: disable=too-many-public-methods,too-many-lines class BotocoreTest(ContractTestBase): _local_stack: LocalStackContainer @@ -403,9 +412,9 @@ def test_kinesis_fault(self): span_name="Kinesis.PutRecord", ) - def test_bedrock_runtime_invoke_model(self): + def test_bedrock_runtime_invoke_model_amazon_titan(self): self.do_test_requests( - "bedrock/invokemodel/invoke-model", + "bedrock/invokemodel/invoke-model/amazon.titan-text-premier-v1:0", "GET", 200, 0, @@ -418,6 +427,153 @@ def test_bedrock_runtime_invoke_model(self): cloudformation_primary_identifier="amazon.titan-text-premier-v1:0", request_specific_attributes={ _GEN_AI_REQUEST_MODEL: "amazon.titan-text-premier-v1:0", + _GEN_AI_REQUEST_MAX_TOKENS: 3072, + _GEN_AI_REQUEST_TEMPERATURE: 0.7, + _GEN_AI_REQUEST_TOP_P: 0.9, + }, + response_specific_attributes={ + _GEN_AI_RESPONSE_FINISH_REASONS: ["CONTENT_FILTERED"], + _GEN_AI_USAGE_INPUT_TOKENS: 15, + _GEN_AI_USAGE_OUTPUT_TOKENS: 13, + }, + span_name="Bedrock 
Runtime.InvokeModel", + ) + + def test_bedrock_runtime_invoke_model_anthropic_claude(self): + self.do_test_requests( + "bedrock/invokemodel/invoke-model/anthropic.claude-v2:1", + "GET", + 200, + 0, + 0, + rpc_service="Bedrock Runtime", + remote_service="AWS::BedrockRuntime", + remote_operation="InvokeModel", + remote_resource_type="AWS::Bedrock::Model", + remote_resource_identifier="anthropic.claude-v2:1", + cloudformation_primary_identifier="anthropic.claude-v2:1", + request_specific_attributes={ + _GEN_AI_REQUEST_MODEL: "anthropic.claude-v2:1", + _GEN_AI_REQUEST_MAX_TOKENS: 1000, + _GEN_AI_REQUEST_TEMPERATURE: 0.99, + _GEN_AI_REQUEST_TOP_P: 1, + }, + response_specific_attributes={ + _GEN_AI_RESPONSE_FINISH_REASONS: ["end_turn"], + _GEN_AI_USAGE_INPUT_TOKENS: 15, + _GEN_AI_USAGE_OUTPUT_TOKENS: 13, + }, + span_name="Bedrock Runtime.InvokeModel", + ) + + def test_bedrock_runtime_invoke_model_meta_llama(self): + self.do_test_requests( + "bedrock/invokemodel/invoke-model/meta.llama2-13b-chat-v1", + "GET", + 200, + 0, + 0, + rpc_service="Bedrock Runtime", + remote_service="AWS::BedrockRuntime", + remote_operation="InvokeModel", + remote_resource_type="AWS::Bedrock::Model", + remote_resource_identifier="meta.llama2-13b-chat-v1", + cloudformation_primary_identifier="meta.llama2-13b-chat-v1", + request_specific_attributes={ + _GEN_AI_REQUEST_MODEL: "meta.llama2-13b-chat-v1", + _GEN_AI_REQUEST_MAX_TOKENS: 512, + _GEN_AI_REQUEST_TEMPERATURE: 0.5, + _GEN_AI_REQUEST_TOP_P: 0.9, + }, + response_specific_attributes={ + _GEN_AI_RESPONSE_FINISH_REASONS: ["stop"], + _GEN_AI_USAGE_INPUT_TOKENS: 31, + _GEN_AI_USAGE_OUTPUT_TOKENS: 49, + }, + span_name="Bedrock Runtime.InvokeModel", + ) + + def test_bedrock_runtime_invoke_model_cohere_command(self): + self.do_test_requests( + "bedrock/invokemodel/invoke-model/cohere.command-r-v1:0", + "GET", + 200, + 0, + 0, + rpc_service="Bedrock Runtime", + remote_service="AWS::BedrockRuntime", + remote_operation="InvokeModel", + 
remote_resource_type="AWS::Bedrock::Model", + remote_resource_identifier="cohere.command-r-v1:0", + cloudformation_primary_identifier="cohere.command-r-v1:0", + request_specific_attributes={ + _GEN_AI_REQUEST_MODEL: "cohere.command-r-v1:0", + _GEN_AI_REQUEST_MAX_TOKENS: 512, + _GEN_AI_REQUEST_TEMPERATURE: 0.5, + _GEN_AI_REQUEST_TOP_P: 0.65, + }, + response_specific_attributes={ + _GEN_AI_RESPONSE_FINISH_REASONS: ["COMPLETE"], + _GEN_AI_USAGE_INPUT_TOKENS: math.ceil( + len("Describe the purpose of a 'hello world' program in one line.") / 6 + ), + _GEN_AI_USAGE_OUTPUT_TOKENS: math.ceil(len("test-generation-text") / 6), + }, + span_name="Bedrock Runtime.InvokeModel", + ) + + def test_bedrock_runtime_invoke_model_ai21_jamba(self): + self.do_test_requests( + "bedrock/invokemodel/invoke-model/ai21.jamba-1-5-large-v1:0", + "GET", + 200, + 0, + 0, + rpc_service="Bedrock Runtime", + remote_service="AWS::BedrockRuntime", + remote_operation="InvokeModel", + remote_resource_type="AWS::Bedrock::Model", + remote_resource_identifier="ai21.jamba-1-5-large-v1:0", + cloudformation_primary_identifier="ai21.jamba-1-5-large-v1:0", + request_specific_attributes={ + _GEN_AI_REQUEST_MODEL: "ai21.jamba-1-5-large-v1:0", + _GEN_AI_REQUEST_MAX_TOKENS: 512, + _GEN_AI_REQUEST_TEMPERATURE: 0.6, + _GEN_AI_REQUEST_TOP_P: 0.8, + }, + response_specific_attributes={ + _GEN_AI_RESPONSE_FINISH_REASONS: ["stop"], + _GEN_AI_USAGE_INPUT_TOKENS: 21, + _GEN_AI_USAGE_OUTPUT_TOKENS: 24, + }, + span_name="Bedrock Runtime.InvokeModel", + ) + + def test_bedrock_runtime_invoke_model_mistral(self): + self.do_test_requests( + "bedrock/invokemodel/invoke-model/mistral.mistral-7b-instruct-v0:2", + "GET", + 200, + 0, + 0, + rpc_service="Bedrock Runtime", + remote_service="AWS::BedrockRuntime", + remote_operation="InvokeModel", + remote_resource_type="AWS::Bedrock::Model", + remote_resource_identifier="mistral.mistral-7b-instruct-v0:2", + cloudformation_primary_identifier="mistral.mistral-7b-instruct-v0:2", + 
request_specific_attributes={ + _GEN_AI_REQUEST_MODEL: "mistral.mistral-7b-instruct-v0:2", + _GEN_AI_REQUEST_MAX_TOKENS: 4096, + _GEN_AI_REQUEST_TEMPERATURE: 0.75, + _GEN_AI_REQUEST_TOP_P: 0.99, + }, + response_specific_attributes={ + _GEN_AI_RESPONSE_FINISH_REASONS: ["stop"], + _GEN_AI_USAGE_INPUT_TOKENS: math.ceil( + len("Describe the purpose of a 'hello world' program in one line.") / 6 + ), + _GEN_AI_USAGE_OUTPUT_TOKENS: math.ceil(len("test-output-text") / 6), }, span_name="Bedrock Runtime.InvokeModel", ) @@ -772,21 +928,23 @@ def _assert_semantic_conventions_attributes( # TODO: botocore instrumentation is not respecting PEER_SERVICE # self._assert_str_attribute(attributes_dict, SpanAttributes.PEER_SERVICE, "backend:8080") for key, value in request_specific_attributes.items(): - if isinstance(value, str): - self._assert_str_attribute(attributes_dict, key, value) - elif isinstance(value, int): - self._assert_int_attribute(attributes_dict, key, value) - else: - self._assert_array_value_ddb_table_name(attributes_dict, key, value) + self._assert_attribute(attributes_dict, key, value) + for key, value in response_specific_attributes.items(): + self._assert_attribute(attributes_dict, key, value) + + def _assert_attribute(self, attributes_dict: Dict[str, AnyValue], key, value) -> None: + if isinstance(value, str): if self._is_valid_regex(value): self._assert_match_attribute(attributes_dict, key, value) - elif isinstance(value, str): - self._assert_str_attribute(attributes_dict, key, value) - elif isinstance(value, int): - self._assert_int_attribute(attributes_dict, key, value) else: - self._assert_array_value_ddb_table_name(attributes_dict, key, value) + self._assert_str_attribute(attributes_dict, key, value) + elif isinstance(value, int): + self._assert_int_attribute(attributes_dict, key, value) + elif isinstance(value, float): + self._assert_float_attribute(attributes_dict, key, value) + else: + self._assert_array_value_ddb_table_name(attributes_dict, key, value) 
@override def _assert_metric_attributes(