From 642427ec514387452df5030598b83ab686211442 Mon Sep 17 00:00:00 2001
From: Steve Liu <liustve@amazon.com>
Date: Thu, 21 Nov 2024 17:25:54 -0800
Subject: [PATCH] feat: Add Contract Tests for new Gen AI attributes for
 foundational models (#292)

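Add contract tests for the new gen_ai inference parameter attributes
(request temperature, top_p and max_tokens; response finish reasons;
input/output token usage) that were introduced in
https://github.com/aws-observability/aws-otel-python-instrumentation/pull/290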

<img width="1563" alt="image"
src="https://github.com/user-attachments/assets/3ea5979d-43b2-43d6-8730-708855969d8a">

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.

---------

Co-authored-by: Michael He <53622546+yiyuan-he@users.noreply.github.com>
---
 .../applications/botocore/botocore_server.py  | 164 +++++++++++++--
 .../test/amazon/base/contract_test_base.py    |   8 +-
 .../test/amazon/botocore/botocore_test.py     | 186 ++++++++++++++++--
 3 files changed, 327 insertions(+), 31 deletions(-)
diff --git a/contract-tests/images/applications/botocore/botocore_server.py b/contract-tests/images/applications/botocore/botocore_server.py
index f16948390..d1736d56c 100644
--- a/contract-tests/images/applications/botocore/botocore_server.py
+++ b/contract-tests/images/applications/botocore/botocore_server.py
@@ -6,6 +6,7 @@
 import tempfile
 from collections import namedtuple
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from io import BytesIO
 from threading import Thread
 
 import boto3
@@ -13,6 +14,7 @@
 from botocore.client import BaseClient
 from botocore.config import Config
 from botocore.exceptions import ClientError
+from botocore.response import StreamingBody
 from typing_extensions import Tuple, override
 
 _PORT: int = 8080
@@ -285,28 +287,22 @@ def _handle_bedrock_request(self) -> None:
                 },
             )
         elif self.in_path("invokemodel/invoke-model"):
+            model_id, request_body, response_body = get_model_request_response(self.path)
+
             set_main_status(200)
             bedrock_runtime_client.meta.events.register(
                 "before-call.bedrock-runtime.InvokeModel",
-                inject_200_success,
-            )
-            model_id = "amazon.titan-text-premier-v1:0"
-            user_message = "Describe the purpose of a 'hello world' program in one line."
-            prompt = f"<s>[INST] {user_message} [/INST]"
-            body = json.dumps(
-                {
-                    "inputText": prompt,
-                    "textGenerationConfig": {
-                        "maxTokenCount": 3072,
-                        "stopSequences": [],
-                        "temperature": 0.7,
-                        "topP": 0.9,
-                    },
-                }
+                lambda **kwargs: inject_200_success(
+                    modelId=model_id,
+                    body=response_body,
+                    **kwargs,
+                ),
             )
             accept = "application/json"
             content_type = "application/json"
-            bedrock_runtime_client.invoke_model(body=body, modelId=model_id, accept=accept, contentType=content_type)
+            bedrock_runtime_client.invoke_model(
+                body=request_body, modelId=model_id, accept=accept, contentType=content_type
+            )
         else:
             set_main_status(404)
 
@@ -378,6 +374,137 @@ def _end_request(self, status_code: int):
         self.end_headers()
 
 
+def get_model_request_response(path):
+    prompt = "Describe the purpose of a 'hello world' program in one line."
+    model_id = ""
+    request_body = {}
+    response_body = {}
+
+    if "amazon.titan" in path:
+        model_id = "amazon.titan-text-premier-v1:0"
+
+        request_body = {
+            "inputText": prompt,
+            "textGenerationConfig": {
+                "maxTokenCount": 3072,
+                "stopSequences": [],
+                "temperature": 0.7,
+                "topP": 0.9,
+            },
+        }
+
+        response_body = {
+            "inputTextTokenCount": 15,
+            "results": [
+                {
+                    "tokenCount": 13,
+                    "outputText": "text-test-response",
+                    "completionReason": "CONTENT_FILTERED",
+                },
+            ],
+        }
+
+    if "anthropic.claude" in path:
+        model_id = "anthropic.claude-v2:1"
+
+        request_body = {
+            "anthropic_version": "bedrock-2023-05-31",
+            "max_tokens": 1000,
+            "temperature": 0.99,
+            "top_p": 1,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": prompt}],
+                },
+            ],
+        }
+
+        response_body = {
+            "stop_reason": "end_turn",
+            "usage": {
+                "input_tokens": 15,
+                "output_tokens": 13,
+            },
+        }
+
+    if "meta.llama" in path:
+        model_id = "meta.llama2-13b-chat-v1"
+
+        request_body = {"prompt": prompt, "max_gen_len": 512, "temperature": 0.5, "top_p": 0.9}
+
+        response_body = {"prompt_token_count": 31, "generation_token_count": 49, "stop_reason": "stop"}
+
+    if "cohere.command" in path:
+        model_id = "cohere.command-r-v1:0"
+
+        request_body = {
+            "chat_history": [],
+            "message": prompt,
+            "max_tokens": 512,
+            "temperature": 0.5,
+            "p": 0.65,
+        }
+
+        response_body = {
+            "chat_history": [
+                {"role": "USER", "message": prompt},
+                {"role": "CHATBOT", "message": "test-text-output"},
+            ],
+            "finish_reason": "COMPLETE",
+            "text": "test-generation-text",
+        }
+
+    if "ai21.jamba" in path:
+        model_id = "ai21.jamba-1-5-large-v1:0"
+
+        request_body = {
+            "messages": [
+                {
+                    "role": "user",
+                    "content": prompt,
+                },
+            ],
+            "top_p": 0.8,
+            "temperature": 0.6,
+            "max_tokens": 512,
+        }
+
+        response_body = {
+            "stop_reason": "end_turn",
+            "usage": {
+                "prompt_tokens": 21,
+                "completion_tokens": 24,
+            },
+            "choices": [
+                {"finish_reason": "stop"},
+            ],
+        }
+
+    if "mistral" in path:
+        model_id = "mistral.mistral-7b-instruct-v0:2"
+
+        request_body = {
+            "prompt": prompt,
+            "max_tokens": 4096,
+            "temperature": 0.75,
+            "top_p": 0.99,
+        }
+
+        response_body = {
+            "outputs": [
+                {
+                    "text": "test-output-text",
+                    "stop_reason": "stop",
+                },
+            ]
+        }
+
+    json_bytes = json.dumps(response_body).encode("utf-8")
+
+    return model_id, json.dumps(request_body), StreamingBody(BytesIO(json_bytes), len(json_bytes))
+
+
 def set_main_status(status: int) -> None:
     RequestHandler.main_status = status
 
@@ -490,11 +617,16 @@ def inject_200_success(**kwargs):
     guardrail_arn = kwargs.get("guardrailArn")
     if guardrail_arn is not None:
         response_body["guardrailArn"] = guardrail_arn
+    model_id = kwargs.get("modelId")
+    if model_id is not None:
+        response_body["modelId"] = model_id
 
     HTTPResponse = namedtuple("HTTPResponse", ["status_code", "headers", "body"])
     headers = kwargs.get("headers", {})
     body = kwargs.get("body", "")
+    response_body["body"] = body
     http_response = HTTPResponse(200, headers=headers, body=body)
+
     return http_response, response_body
 
 
diff --git a/contract-tests/tests/test/amazon/base/contract_test_base.py b/contract-tests/tests/test/amazon/base/contract_test_base.py
index ba96530b0..64569450b 100644
--- a/contract-tests/tests/test/amazon/base/contract_test_base.py
+++ b/contract-tests/tests/test/amazon/base/contract_test_base.py
@@ -173,6 +173,12 @@ def _assert_int_attribute(self, attributes_dict: Dict[str, AnyValue], key: str,
         self.assertIsNotNone(actual_value)
         self.assertEqual(expected_value, actual_value.int_value)
 
+    def _assert_float_attribute(self, attributes_dict: Dict[str, AnyValue], key: str, expected_value: float) -> None:
+        self.assertIn(key, attributes_dict)
+        actual_value: AnyValue = attributes_dict[key]
+        self.assertIsNotNone(actual_value)
+        self.assertEqual(expected_value, actual_value.double_value)
+
     def _assert_match_attribute(self, attributes_dict: Dict[str, AnyValue], key: str, pattern: str) -> None:
         self.assertIn(key, attributes_dict)
         actual_value: AnyValue = attributes_dict[key]
@@ -237,5 +243,5 @@ def _is_valid_regex(self, pattern: str) -> bool:
         try:
             re.compile(pattern)
             return True
-        except re.error:
+        except (re.error, StopIteration, RuntimeError, KeyError):
             return False
diff --git a/contract-tests/tests/test/amazon/botocore/botocore_test.py b/contract-tests/tests/test/amazon/botocore/botocore_test.py
index f5ae91a59..b2821a8b6 100644
--- a/contract-tests/tests/test/amazon/botocore/botocore_test.py
+++ b/contract-tests/tests/test/amazon/botocore/botocore_test.py
@@ -1,5 +1,6 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
+import math
 from logging import INFO, Logger, getLogger
 from typing import Dict, List
 
@@ -34,13 +35,21 @@
 _AWS_BEDROCK_GUARDRAIL_ID: str = "aws.bedrock.guardrail.id"
 _AWS_BEDROCK_KNOWLEDGE_BASE_ID: str = "aws.bedrock.knowledge_base.id"
 _AWS_BEDROCK_DATA_SOURCE_ID: str = "aws.bedrock.data_source.id"
+
 _GEN_AI_REQUEST_MODEL: str = "gen_ai.request.model"
+_GEN_AI_REQUEST_TEMPERATURE: str = "gen_ai.request.temperature"
+_GEN_AI_REQUEST_TOP_P: str = "gen_ai.request.top_p"
+_GEN_AI_REQUEST_MAX_TOKENS: str = "gen_ai.request.max_tokens"
+_GEN_AI_RESPONSE_FINISH_REASONS: str = "gen_ai.response.finish_reasons"
+_GEN_AI_USAGE_INPUT_TOKENS: str = "gen_ai.usage.input_tokens"
+_GEN_AI_USAGE_OUTPUT_TOKENS: str = "gen_ai.usage.output_tokens"
+
 _AWS_SECRET_ARN: str = "aws.secretsmanager.secret.arn"
 _AWS_STATE_MACHINE_ARN: str = "aws.stepfunctions.state_machine.arn"
 _AWS_ACTIVITY_ARN: str = "aws.stepfunctions.activity.arn"
 
 
-# pylint: disable=too-many-public-methods
+# pylint: disable=too-many-public-methods,too-many-lines
 class BotocoreTest(ContractTestBase):
     _local_stack: LocalStackContainer
 
@@ -403,9 +412,9 @@ def test_kinesis_fault(self):
             span_name="Kinesis.PutRecord",
         )
 
-    def test_bedrock_runtime_invoke_model(self):
+    def test_bedrock_runtime_invoke_model_amazon_titan(self):
         self.do_test_requests(
-            "bedrock/invokemodel/invoke-model",
+            "bedrock/invokemodel/invoke-model/amazon.titan-text-premier-v1:0",
             "GET",
             200,
             0,
@@ -418,6 +427,153 @@ def test_bedrock_runtime_invoke_model(self):
             cloudformation_primary_identifier="amazon.titan-text-premier-v1:0",
             request_specific_attributes={
                 _GEN_AI_REQUEST_MODEL: "amazon.titan-text-premier-v1:0",
+                _GEN_AI_REQUEST_MAX_TOKENS: 3072,
+                _GEN_AI_REQUEST_TEMPERATURE: 0.7,
+                _GEN_AI_REQUEST_TOP_P: 0.9,
+            },
+            response_specific_attributes={
+                _GEN_AI_RESPONSE_FINISH_REASONS: ["CONTENT_FILTERED"],
+                _GEN_AI_USAGE_INPUT_TOKENS: 15,
+                _GEN_AI_USAGE_OUTPUT_TOKENS: 13,
+            },
+            span_name="Bedrock Runtime.InvokeModel",
+        )
+
+    def test_bedrock_runtime_invoke_model_anthropic_claude(self):
+        self.do_test_requests(
+            "bedrock/invokemodel/invoke-model/anthropic.claude-v2:1",
+            "GET",
+            200,
+            0,
+            0,
+            rpc_service="Bedrock Runtime",
+            remote_service="AWS::BedrockRuntime",
+            remote_operation="InvokeModel",
+            remote_resource_type="AWS::Bedrock::Model",
+            remote_resource_identifier="anthropic.claude-v2:1",
+            cloudformation_primary_identifier="anthropic.claude-v2:1",
+            request_specific_attributes={
+                _GEN_AI_REQUEST_MODEL: "anthropic.claude-v2:1",
+                _GEN_AI_REQUEST_MAX_TOKENS: 1000,
+                _GEN_AI_REQUEST_TEMPERATURE: 0.99,
+                _GEN_AI_REQUEST_TOP_P: 1,
+            },
+            response_specific_attributes={
+                _GEN_AI_RESPONSE_FINISH_REASONS: ["end_turn"],
+                _GEN_AI_USAGE_INPUT_TOKENS: 15,
+                _GEN_AI_USAGE_OUTPUT_TOKENS: 13,
+            },
+            span_name="Bedrock Runtime.InvokeModel",
+        )
+
+    def test_bedrock_runtime_invoke_model_meta_llama(self):
+        self.do_test_requests(
+            "bedrock/invokemodel/invoke-model/meta.llama2-13b-chat-v1",
+            "GET",
+            200,
+            0,
+            0,
+            rpc_service="Bedrock Runtime",
+            remote_service="AWS::BedrockRuntime",
+            remote_operation="InvokeModel",
+            remote_resource_type="AWS::Bedrock::Model",
+            remote_resource_identifier="meta.llama2-13b-chat-v1",
+            cloudformation_primary_identifier="meta.llama2-13b-chat-v1",
+            request_specific_attributes={
+                _GEN_AI_REQUEST_MODEL: "meta.llama2-13b-chat-v1",
+                _GEN_AI_REQUEST_MAX_TOKENS: 512,
+                _GEN_AI_REQUEST_TEMPERATURE: 0.5,
+                _GEN_AI_REQUEST_TOP_P: 0.9,
+            },
+            response_specific_attributes={
+                _GEN_AI_RESPONSE_FINISH_REASONS: ["stop"],
+                _GEN_AI_USAGE_INPUT_TOKENS: 31,
+                _GEN_AI_USAGE_OUTPUT_TOKENS: 49,
+            },
+            span_name="Bedrock Runtime.InvokeModel",
+        )
+
+    def test_bedrock_runtime_invoke_model_cohere_command(self):
+        self.do_test_requests(
+            "bedrock/invokemodel/invoke-model/cohere.command-r-v1:0",
+            "GET",
+            200,
+            0,
+            0,
+            rpc_service="Bedrock Runtime",
+            remote_service="AWS::BedrockRuntime",
+            remote_operation="InvokeModel",
+            remote_resource_type="AWS::Bedrock::Model",
+            remote_resource_identifier="cohere.command-r-v1:0",
+            cloudformation_primary_identifier="cohere.command-r-v1:0",
+            request_specific_attributes={
+                _GEN_AI_REQUEST_MODEL: "cohere.command-r-v1:0",
+                _GEN_AI_REQUEST_MAX_TOKENS: 512,
+                _GEN_AI_REQUEST_TEMPERATURE: 0.5,
+                _GEN_AI_REQUEST_TOP_P: 0.65,
+            },
+            response_specific_attributes={
+                _GEN_AI_RESPONSE_FINISH_REASONS: ["COMPLETE"],
+                _GEN_AI_USAGE_INPUT_TOKENS: math.ceil(
+                    len("Describe the purpose of a 'hello world' program in one line.") / 6
+                ),
+                _GEN_AI_USAGE_OUTPUT_TOKENS: math.ceil(len("test-generation-text") / 6),
+            },
+            span_name="Bedrock Runtime.InvokeModel",
+        )
+
+    def test_bedrock_runtime_invoke_model_ai21_jamba(self):
+        self.do_test_requests(
+            "bedrock/invokemodel/invoke-model/ai21.jamba-1-5-large-v1:0",
+            "GET",
+            200,
+            0,
+            0,
+            rpc_service="Bedrock Runtime",
+            remote_service="AWS::BedrockRuntime",
+            remote_operation="InvokeModel",
+            remote_resource_type="AWS::Bedrock::Model",
+            remote_resource_identifier="ai21.jamba-1-5-large-v1:0",
+            cloudformation_primary_identifier="ai21.jamba-1-5-large-v1:0",
+            request_specific_attributes={
+                _GEN_AI_REQUEST_MODEL: "ai21.jamba-1-5-large-v1:0",
+                _GEN_AI_REQUEST_MAX_TOKENS: 512,
+                _GEN_AI_REQUEST_TEMPERATURE: 0.6,
+                _GEN_AI_REQUEST_TOP_P: 0.8,
+            },
+            response_specific_attributes={
+                _GEN_AI_RESPONSE_FINISH_REASONS: ["stop"],
+                _GEN_AI_USAGE_INPUT_TOKENS: 21,
+                _GEN_AI_USAGE_OUTPUT_TOKENS: 24,
+            },
+            span_name="Bedrock Runtime.InvokeModel",
+        )
+
+    def test_bedrock_runtime_invoke_model_mistral(self):
+        self.do_test_requests(
+            "bedrock/invokemodel/invoke-model/mistral.mistral-7b-instruct-v0:2",
+            "GET",
+            200,
+            0,
+            0,
+            rpc_service="Bedrock Runtime",
+            remote_service="AWS::BedrockRuntime",
+            remote_operation="InvokeModel",
+            remote_resource_type="AWS::Bedrock::Model",
+            remote_resource_identifier="mistral.mistral-7b-instruct-v0:2",
+            cloudformation_primary_identifier="mistral.mistral-7b-instruct-v0:2",
+            request_specific_attributes={
+                _GEN_AI_REQUEST_MODEL: "mistral.mistral-7b-instruct-v0:2",
+                _GEN_AI_REQUEST_MAX_TOKENS: 4096,
+                _GEN_AI_REQUEST_TEMPERATURE: 0.75,
+                _GEN_AI_REQUEST_TOP_P: 0.99,
+            },
+            response_specific_attributes={
+                _GEN_AI_RESPONSE_FINISH_REASONS: ["stop"],
+                _GEN_AI_USAGE_INPUT_TOKENS: math.ceil(
+                    len("Describe the purpose of a 'hello world' program in one line.") / 6
+                ),
+                _GEN_AI_USAGE_OUTPUT_TOKENS: math.ceil(len("test-output-text") / 6),
             },
             span_name="Bedrock Runtime.InvokeModel",
         )
@@ -772,21 +928,23 @@ def _assert_semantic_conventions_attributes(
         # TODO: botocore instrumentation is not respecting PEER_SERVICE
         # self._assert_str_attribute(attributes_dict, SpanAttributes.PEER_SERVICE, "backend:8080")
         for key, value in request_specific_attributes.items():
-            if isinstance(value, str):
-                self._assert_str_attribute(attributes_dict, key, value)
-            elif isinstance(value, int):
-                self._assert_int_attribute(attributes_dict, key, value)
-            else:
-                self._assert_array_value_ddb_table_name(attributes_dict, key, value)
+            self._assert_attribute(attributes_dict, key, value)
+
         for key, value in response_specific_attributes.items():
+            self._assert_attribute(attributes_dict, key, value)
+
+    def _assert_attribute(self, attributes_dict: Dict[str, AnyValue], key, value) -> None:
+        if isinstance(value, str):
             if self._is_valid_regex(value):
                 self._assert_match_attribute(attributes_dict, key, value)
-            elif isinstance(value, str):
-                self._assert_str_attribute(attributes_dict, key, value)
-            elif isinstance(value, int):
-                self._assert_int_attribute(attributes_dict, key, value)
             else:
-                self._assert_array_value_ddb_table_name(attributes_dict, key, value)
+                self._assert_str_attribute(attributes_dict, key, value)
+        elif isinstance(value, int):
+            self._assert_int_attribute(attributes_dict, key, value)
+        elif isinstance(value, float):
+            self._assert_float_attribute(attributes_dict, key, value)
+        else:
+            self._assert_array_value_ddb_table_name(attributes_dict, key, value)
 
     @override
     def _assert_metric_attributes(