Send the image component in the "image_url" content field instead of an HTML <img> tag

triton-inference-server · Jun 27, 2024 · a400418 · a400418
1 parent 20fe487
commit a400418
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 10 deletions.
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -382,14 +382,18 @@ def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict:
     def _add_vision_input(
         cls, generic_dataset_json: Dict[str, List[Dict]], vision_filename: Path
     ) -> Dict[str, List[Dict]]:
+        img_base64 = encode_image(vision_filename)
         for row in generic_dataset_json["rows"]:
-            img_base64 = encode_image(vision_filename)
-            row["text_input"] += (
-                "\n"
-                "What two words from the text above describes the image the best?"
-                " Explain your choice.\n"
-                f'<img src="data:image/png;base64,{img_base64}"/>'
-            )
+            if isinstance(row['text_input'], str):
+                row['text_input'] = [dict(
+                    type='text',
+                    text=row['text_input'],
+                )]
+
+            row['text_input'].append(dict(
+                type='image_url',
+                image_url=f'data:image/jpeg;base64,{img_base64}',
+            ))
 
         return generic_dataset_json
 

diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -546,7 +546,10 @@ def _get_openai_input_text(self, req_inputs: dict) -> str:
         if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS:
             return payload["messages"][0]["content"]
         elif self._response_format == ResponseFormat.OPENAI_VISION:
-            return payload["messages"][0]["content"]
+            content = payload["messages"][0]["content"]
+            if isinstance(content, str):
+                content = [dict(type='text', text=content)]
+            return ' '.join(c['text'] for c in content if c['type'] == 'text')
         elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS:
             return payload["prompt"]
         else:

diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
@@ -539,7 +539,11 @@ def test_extra_inputs(
                         item[input_name] == input_value
                     ), f"The value of {input_name} is incorrect"
                     if output_format == OutputFormat.OPENAI_VISION:
-                        assert "<img src" in item["messages"][-1]["content"], item
+                        assert any(
+                            isinstance(c, dict)
+                            and c['type'] == 'image_url'
+                            for c in item['messages'][-1]['content']
+                        )
         elif (
             output_format == OutputFormat.TENSORRTLLM
             or output_format == OutputFormat.VLLM

diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -668,7 +668,7 @@ def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None:
                     {
                         "timestamp": 1,
                         "request_inputs": {
-                            "payload": '{"messages":[{"role":"user","content":"This is test"}],"stream":true}',
+                            "payload": '{"messages":[{"role":"user","content":[{"type":"text","text":"This is test"}]}],"stream":true}',
                         },
                         # the first, and the last two responses will be ignored because they have no "content"
                         "response_timestamps": [3, 5, 8, 12, 13, 14],