Variable number of images per prompt (#765)
* variable number of images per prompt

* add TODO comment

Co-authored-by: Hyunjae Woo <[email protected]>

* apply code review suggestions

* Update src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py

Co-authored-by: Hyunjae Woo <[email protected]>

---------

Co-authored-by: Hyunjae Woo <[email protected]>
mwawrzos and nv-hwoo authored Jul 30, 2024
1 parent ebafa2d commit ed0312e
Showing 8 changed files with 89 additions and 21 deletions.
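The behavioral core of the commit is that each synthetic prompt now receives a randomly drawn number of images instead of exactly one. A minimal sketch of the sampling semantics, using only the standard library (the helper name below is illustrative, not part of the commit):

```python
import random

def sample_image_count(images_count_min: int = 0, images_count_max: int = 1) -> int:
    """Draw the number of images attached to a single prompt.

    random.randint is inclusive on both ends, so the defaults (0, 1) mean each
    prompt independently gets either zero images or one image.
    """
    return random.randint(images_count_min, images_count_max)

# e.g. with --images-count-min 1 --images-count-max 3, every prompt carries
# between one and three synthetic images.
counts = [sample_image_count(1, 3) for _ in range(5)]
print(counts)  # something like [2, 1, 3, 1, 2]
```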
src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -86,6 +86,8 @@ class LlmInputs:
DEFAULT_IMAGE_WIDTH_STDDEV = 0
DEFAULT_IMAGE_HEIGHT_MEAN = 100
DEFAULT_IMAGE_HEIGHT_STDDEV = 0
DEFAULT_IMAGES_COUNT_MIN = 0
DEFAULT_IMAGES_COUNT_MAX = 1

EMPTY_JSON_IN_VLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_TENSORRTLLM_PA_FORMAT: Dict = {"data": []}
@@ -114,6 +116,8 @@ def create_llm_inputs(
image_height_mean: int = DEFAULT_IMAGE_HEIGHT_MEAN,
image_height_stddev: int = DEFAULT_IMAGE_HEIGHT_STDDEV,
image_format: ImageFormat = ImageFormat.PNG,
images_count_min: int = DEFAULT_IMAGES_COUNT_MIN,
images_count_max: int = DEFAULT_IMAGES_COUNT_MAX,
random_seed: int = DEFAULT_RANDOM_SEED,
num_of_output_prompts: int = DEFAULT_NUM_PROMPTS,
add_model_name: bool = False,
@@ -166,6 +170,10 @@ def create_llm_inputs(
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
images_count_min:
Minimum number of synthetic images to be added to a prompt.
images_count_max:
Maximum number of synthetic images to be added to a prompt.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
@@ -207,6 +215,8 @@ def create_llm_inputs(
image_height_mean,
image_height_stddev,
image_format,
images_count_min,
images_count_max,
batch_size,
input_filename,
)
@@ -247,6 +257,8 @@ def get_generic_dataset_json(
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
images_count_min: int,
images_count_max: int,
batch_size: int,
input_filename: Optional[Path],
) -> Dict:
@@ -283,6 +295,10 @@ def get_generic_dataset_json(
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
images_count_min:
Minimum number of synthetic images to be added to a prompt.
images_count_max:
Maximum number of synthetic images to be added to a prompt.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
input_filename:
@@ -350,6 +366,8 @@ def get_generic_dataset_json(
image_height_mean,
image_height_stddev,
image_format,
images_count_min,
images_count_max,
output_format,
)
generic_dataset_json = (
@@ -480,6 +498,8 @@ def _get_input_dataset_from_synthetic(
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
images_count_min: int,
images_count_max: int,
output_format: OutputFormat,
) -> Dict[str, Any]:
dataset_json: Dict[str, Any] = {}
@@ -495,14 +515,18 @@ def _get_input_dataset_from_synthetic(
row["row"]["text_input"] = synthetic_prompt

if output_format == OutputFormat.OPENAI_VISION:
synthetic_image = cls._create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)
row["row"]["image"] = synthetic_image
N = random.randint(images_count_min, images_count_max)
synthetic_images = [
cls._create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)
for _ in range(N)
]
row["row"]["images"] = synthetic_images

dataset_json["rows"].append(row)
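For reference, a hypothetical generic-dataset row produced by this branch when the output format is OPENAI_VISION; the prompt text mirrors the test fixtures and the data URLs are placeholders:

```python
# "images" is now a list whose length is drawn per prompt from the inclusive
# range [images_count_min, images_count_max].
row = {
    "row": {
        "text_input": "This is test prompt",
        "images": [
            "data:image/png;base64,<encoded image 1>",
            "data:image/png;base64,<encoded image 2>",
        ],
    }
}
```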

@@ -607,8 +631,9 @@ def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict:
dataset_json["features"] = [{"name": "text_input"}]
dataset_json["rows"] = []
for prompt, image in zip(prompts, images):
content = {"text_input": prompt}
content.update({"image": image} if image else {})
# (TMA-2004) support variable images per request through input file
content: Dict[str, Any] = {"text_input": prompt}
content.update({"images": [image]} if image else {})
dataset_json["rows"].append({"row": content})

return dataset_json
@@ -652,16 +677,19 @@ def _convert_to_openai_multi_modal_content(
Converts to multi-modal content format of OpenAI Chat Completions API.
"""
for row in generic_dataset_json["rows"]:
if row["image"]:
if row["images"]:
row["text_input"] = [
{
"type": "text",
"text": row["text_input"],
},
{
"type": "image_url",
"image_url": {"url": row["image"]},
},
*[
{
"type": "image_url",
"image_url": {"url": image},
}
for image in row["images"]
],
]

return generic_dataset_json
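Assuming a row that carries two images, the converted text_input would look roughly like the sketch below (not output captured from the tool): one text part followed by one image_url part per image. Rows whose "images" list is empty keep their plain string text_input, which is why the profile data parser change later in this diff also accepts string content.

```python
# Hypothetical converted content for the OpenAI Chat Completions API.
text_input = [
    {"type": "text", "text": "This is test prompt"},
    {"type": "image_url", "image_url": {"url": "data:image/png;base64,<image 1>"}},
    {"type": "image_url", "image_url": {"url": "data:image/png;base64,<image 2>"}},
]
```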
22 changes: 22 additions & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -131,6 +131,12 @@ def _check_image_input_args(
parser.error(
"Both --image-width-stddev and --image-height-stddev values must be non-negative."
)
if args.images_count_min < 0:
parser.error("--images-count-min must be a non-negative integer.")
if args.images_count_max < args.images_count_min:
parser.error(
"--images-count-max must be greater than or equal to --images-count-min."
)

args = _convert_str_to_enum_entry(args, "image_format", ImageFormat)
return args
@@ -481,6 +487,22 @@ def _add_image_input_args(parser):
"If format is not selected, format of generated image is selected at random",
)

input_group.add_argument(
"--images-count-min",
type=int,
default=LlmInputs.DEFAULT_IMAGES_COUNT_MIN,
required=False,
help=f"Minimum number of synthetic images to be added to a prompt.",
)

input_group.add_argument(
"--images-count-max",
type=int,
default=LlmInputs.DEFAULT_IMAGES_COUNT_MAX,
required=False,
help=f"Maximum number of synthetic images to be added to a prompt.",
)
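The two flags follow the usual argparse dash-to-underscore mapping, so they surface as images_count_min and images_count_max on the parsed namespace; _check_image_input_args above additionally rejects a negative minimum and a maximum smaller than the minimum. A throwaway sketch of the mapping, not the genai-perf parser itself:

```python
import argparse

# Minimal stand-in parser that only demonstrates the new flags and defaults.
parser = argparse.ArgumentParser()
parser.add_argument("--images-count-min", type=int, default=0)
parser.add_argument("--images-count-max", type=int, default=1)

args = parser.parse_args(["--images-count-min", "1", "--images-count-max", "3"])
print(args.images_count_min, args.images_count_max)  # 1 3
```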


def _add_profile_args(parser):
profile_group = parser.add_argument_group("Profiling")
src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
@@ -220,6 +220,10 @@ def _get_openai_input_text(self, req_inputs: dict) -> str:
return payload["prompt"]
elif self._response_format == ResponseFormat.OPENAI_VISION:
content = payload["messages"][0]["content"]
# When no images were included in the request input, the content
# is the same as the text-only chat completions format (i.e. a string).
if isinstance(content, str):
return content
return " ".join(c["text"] for c in content if c["type"] == "text")
else:
raise ValueError(
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -98,6 +98,8 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[str]:
"image_height_mean",
"image_height_stddev",
"image_format",
"images_count_min",
"images_count_max",
]

utils.remove_file(args.profile_export_file)
4 changes: 4 additions & 0 deletions src/c++/perf_analyzer/genai-perf/tests/test_cli.py
@@ -234,6 +234,10 @@ def test_help_version_arguments_output_and_exit(
{"image_height_stddev": 456},
),
(["--image-format", "png"], {"image_format": ImageFormat.PNG}),
(
["--images-count-min", "123", "--images-count-max", "321"],
{"images_count_min": 123, "images_count_max": 321},
),
(["-v"], {"verbose": True}),
(["--verbose"], {"verbose": True}),
(["-u", "test_url"], {"u": "test_url"}),
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
@@ -254,6 +254,8 @@ def test_generate_json(self, monkeypatch) -> None:
"image_height_mean": 100,
"image_height_stddev": 0,
"image_format": null,
"images_count_min": 0,
"images_count_max": 1,
"concurrency": 1,
"measurement_interval": 10000,
"request_rate": null,
16 changes: 11 additions & 5 deletions src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
@@ -557,8 +557,8 @@ def test_llm_inputs_with_defaults(self, default_configured_url):
def test_add_image_inputs_openai_vision(self) -> None:
generic_json = {
"rows": [
{"text_input": "test input one", "image": "test_image1"},
{"text_input": "test input two", "image": "test_image2"},
{"text_input": "test input one", "images": ["test_image1"]},
{"text_input": "test input two", "images": ["test_image2"]},
]
}

@@ -608,8 +608,12 @@ def test_add_image_inputs_openai_vision(self) -> None:
OutputFormat.TENSORRTLLM,
],
)
@pytest.mark.parametrize(
"images_count",
[0, 5],
)
def test_get_input_dataset_from_synthetic(
self, mock_prompt, mock_image, output_format
self, mock_prompt, mock_image, output_format, images_count
) -> None:
_placeholder = 123 # dummy value
num_prompts = 3
@@ -624,6 +628,8 @@ def test_get_input_dataset_from_synthetic(
image_height_mean=_placeholder,
image_height_stddev=_placeholder,
image_format=ImageFormat.PNG,
images_count_min=images_count,
images_count_max=images_count,
output_format=output_format,
)

@@ -635,7 +641,7 @@ def test_get_input_dataset_from_synthetic(
if output_format == OutputFormat.OPENAI_VISION:
assert row == {
"text_input": "This is test prompt",
"image": "test_image_base64",
"images": images_count * ["test_image_base64"],
}
else:
assert row == {
@@ -805,7 +811,7 @@ def test_get_input_file_with_multi_modal_data(
assert len(dataset["rows"]) == len(expected_data)
for i, data in enumerate(expected_data):
assert dataset["rows"][i]["row"]["text_input"] == data.text_input
assert dataset["rows"][i]["row"]["image"] == data.image
assert dataset["rows"][i]["row"]["images"] == [data.image]

@pytest.mark.parametrize(
"seed, model_name_list, index,model_selection_strategy,expected_model",
src/c++/perf_analyzer/genai-perf/tests/test_llm_profile_data_parser.py
@@ -649,7 +649,7 @@ def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None:
{
"timestamp": 2,
"request_inputs": {
"payload": '{"messages":[{"role":"user","content":[{"type":"text","text":"This is test too"},{"type":"image_url","image_url":{"url":"data:image/png;base64,abcdef"}}]}],"model":"llava-1.6","stream":true}',
"payload": '{"messages":[{"role":"user","content":"This is test too"}],"model":"llava-1.6","stream":true}',
},
# the first, and the last two responses will be ignored because they have no "content"
"response_timestamps": [4, 7, 11, 15, 18, 19],
