Support Vision Language Model in GenAI-Perf (#756)
* POC LLaVA VLM support (#720)

* POC for LLaVA support

* non-streaming request in VLM tests

* image component sent in "image_url" field instead of HTML tag (see the payload sketch below)

* generate sample image instead of loading from docs

* add vision to endpoint mapping

* fixes for handling OutputFormat

* refactor - extract image preparation to a separate module

* fixes to the refactor

* replace match-case syntax with if-elif-else

* Update image payload format and fix tests

* A few clean-ups and tickets added for follow-up tasks

* Fix and add tests for vision format

* Remove output format from profile data parser

* Revert irrelevant code change

* Revert changes

* Remove unused dependency

* Comment out test_extra_inputs

---------

Co-authored-by: Hyunjae Woo <[email protected]>
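
For reference, the multi-modal request shape this work targets follows the OpenAI Chat Completions vision format (https://platform.openai.com/docs/guides/vision). A minimal sketch, with illustrative prompt text and data URL:

    {
      "messages": [
        {
          "role": "user",
          "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
          ]
        }
      ]
    }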

* Support multi-modal input from file for OpenAI Chat Completions (#749)

* add synthetic image generator (#751)

* synthetic image generator

* format randomization

* images should be base64-encoded arbitrarily

* randomized image format

* randomized image shape

* prepare SyntheticImageGenerator to support different image sources

* read from files

* python 3.10 support fixes

* remove unused imports

* skip sampled image sizes with negative values

* formats type fix

* remove unused variable

* synthetic image generator encodes images to base64

* image format not randomized

* sample each dimension independently

Co-authored-by: Hyunjae Woo <[email protected]>

* apply code-review suggestions

* update class name

* deterministic synthetic image generator

* add typing to SyntheticImageGenerator

* SyntheticImageGenerator doesn't load files

* SyntheticImageGenerator always encodes images to base64

* remove unused imports

* generate Gaussian noise instead of blank images (see the sketch below)

---------

Co-authored-by: Hyunjae Woo <[email protected]>
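
A minimal sketch of the generation approach described above (deterministic Gaussian-noise pixels, base64-encoded into a data URL). The function name and defaults here are illustrative, not the actual SyntheticImageGenerator API:

    import base64
    from io import BytesIO

    import numpy as np
    from PIL import Image


    def synthetic_image_data_url(width: int, height: int, image_format: str = "PNG") -> str:
        # Gaussian-noise pixels rather than a blank canvas, clipped to valid 8-bit RGB.
        rng = np.random.default_rng(seed=0)  # fixed seed keeps output deterministic
        pixels = rng.normal(loc=128, scale=32, size=(height, width, 3))
        image = Image.fromarray(np.clip(pixels, 0, 255).astype(np.uint8), "RGB")
        buffer = BytesIO()
        image.save(buffer, format=image_format)  # PNG or JPEG, as in ImageFormat
        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return f"data:image/{image_format.lower()};base64,{encoded}"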

* Add command line arguments for synthetic image generation (#753)

* Add CLI options for synthetic image generation (example invocation below)

* read image format from file when --input-file is used

* move encode_image method to utils

* Lazy import some modules

* Support synthetic image generation in GenAI-Perf (#754)

* support synthetic image generation for VLM model

* add test

* integrate synthetic image generator into LlmInputs

* add source images for synthetic image data

* use abs to get positive int

---------

Co-authored-by: Marek Wawrzos <[email protected]>
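
With these changes, a profiling run against a vision model can be configured from the command line. A sketch of an invocation — the flag names are inferred from the new parameters (image_width_mean, image_width_stddev, image_height_mean, image_height_stddev, image_format) and the model name is illustrative, so confirm against genai-perf --help:

    genai-perf \
      -m llava-hf/llava-1.5-7b-hf \
      --service-kind openai \
      --endpoint-type vision \
      --image-width-mean 100 --image-width-stddev 10 \
      --image-height-mean 100 --image-height-stddev 10 \
      --image-format png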
nv-hwoo and mwawrzos authored Jul 18, 2024
1 parent e4d9ef0 commit 30af885
Showing 20 changed files with 793 additions and 111 deletions.
185 changes: 172 additions & 13 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -20,11 +20,17 @@
from typing import Any, Dict, List, Optional, Tuple, cast

import requests
from genai_perf import utils
from genai_perf.constants import CNN_DAILY_MAIL, DEFAULT_INPUT_DATA_JSON, OPEN_ORCA
from genai_perf.exceptions import GenAIPerfException
from genai_perf.llm_inputs.synthetic_image_generator import (
ImageFormat,
SyntheticImageGenerator,
)
from genai_perf.llm_inputs.synthetic_prompt_generator import SyntheticPromptGenerator
from genai_perf.tokenizer import DEFAULT_TOKENIZER, Tokenizer, get_tokenizer
from genai_perf.utils import load_json_str
from PIL import Image
from requests import Response


@@ -43,6 +49,7 @@ class OutputFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
OPENAI_EMBEDDINGS = auto()
OPENAI_VISION = auto()
RANKINGS = auto()
TENSORRTLLM = auto()
VLLM = auto()
@@ -75,6 +82,11 @@ class LlmInputs:
DEFAULT_OUTPUT_TOKENS_STDDEV = 0
DEFAULT_NUM_PROMPTS = 100

DEFAULT_IMAGE_WIDTH_MEAN = 100
DEFAULT_IMAGE_WIDTH_STDDEV = 0
DEFAULT_IMAGE_HEIGHT_MEAN = 100
DEFAULT_IMAGE_HEIGHT_STDDEV = 0

EMPTY_JSON_IN_VLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_TENSORRTLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_OPENAI_PA_FORMAT: Dict = {"data": []}
@@ -97,6 +109,11 @@ def create_llm_inputs(
output_tokens_deterministic: bool = False,
prompt_tokens_mean: int = DEFAULT_PROMPT_TOKENS_MEAN,
prompt_tokens_stddev: int = DEFAULT_PROMPT_TOKENS_STDDEV,
image_width_mean: int = DEFAULT_IMAGE_WIDTH_MEAN,
image_width_stddev: int = DEFAULT_IMAGE_WIDTH_STDDEV,
image_height_mean: int = DEFAULT_IMAGE_HEIGHT_MEAN,
image_height_stddev: int = DEFAULT_IMAGE_HEIGHT_STDDEV,
image_format: ImageFormat = ImageFormat.PNG,
random_seed: int = DEFAULT_RANDOM_SEED,
num_of_output_prompts: int = DEFAULT_NUM_PROMPTS,
add_model_name: bool = False,
@@ -139,6 +156,16 @@ def create_llm_inputs(
The standard deviation of the length of the output to generate. This is only used if output_tokens_mean is provided.
output_tokens_deterministic:
If true, the output tokens will set the minimum and maximum tokens to be equivalent.
image_width_mean:
The mean width of images when generating synthetic image data.
image_width_stddev:
The standard deviation of width of images when generating synthetic image data.
image_height_mean:
The mean height of images when generating synthetic image data.
image_height_stddev:
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
@@ -175,6 +202,11 @@ def create_llm_inputs(
prompt_tokens_mean,
prompt_tokens_stddev,
num_of_output_prompts,
image_width_mean,
image_width_stddev,
image_height_mean,
image_height_stddev,
image_format,
batch_size,
input_filename,
)
@@ -210,6 +242,11 @@ def get_generic_dataset_json(
prompt_tokens_mean: int,
prompt_tokens_stddev: int,
num_of_output_prompts: int,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
batch_size: int,
input_filename: Optional[Path],
) -> Dict:
@@ -236,6 +273,16 @@ def get_generic_dataset_json(
The standard deviation of the length of the prompt to generate
num_of_output_prompts:
The number of synthetic output prompts to generate
image_width_mean:
The mean width of images when generating synthetic image data.
image_width_stddev:
The standard deviation of width of images when generating synthetic image data.
image_height_mean:
The mean height of images when generating synthetic image data.
image_height_stddev:
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
input_filename:
@@ -280,6 +327,12 @@ def get_generic_dataset_json(
)
else:
if input_type == PromptSource.DATASET:
# (TMA-1990) support VLM input from public dataset
if output_format == OutputFormat.OPENAI_VISION:
raise GenAIPerfException(
f"{OutputFormat.OPENAI_VISION.to_lowercase()} currently "
"does not support dataset as input."
)
dataset = cls._get_input_dataset_from_url(
dataset_name, starting_index, length
)
@@ -292,6 +345,12 @@ def get_generic_dataset_json(
prompt_tokens_mean,
prompt_tokens_stddev,
num_of_output_prompts,
image_width_mean,
image_width_stddev,
image_height_mean,
image_height_stddev,
image_format,
output_format,
)
generic_dataset_json = (
cls._convert_input_synthetic_or_file_dataset_to_generic_json(
@@ -301,6 +360,9 @@
elif input_type == PromptSource.FILE:
input_filename = cast(Path, input_filename)
input_file_dataset = cls._get_input_dataset_from_file(input_filename)
input_file_dataset = cls._encode_images_in_input_dataset(
input_file_dataset
)
generic_dataset_json = (
cls._convert_input_synthetic_or_file_dataset_to_generic_json(
input_file_dataset
@@ -309,6 +371,14 @@
else:
raise GenAIPerfException("Input source is not recognized.")

# When the generic_dataset_json contains multi-modal data (e.g. images),
# convert the format of the content to OpenAI multi-modal format:
# see https://platform.openai.com/docs/guides/vision
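# For example (illustrative), a generic row like
#   {"text_input": "What is in this image?", "image": "data:image/png;base64,..."}
# becomes
#   {"text_input": [
#       {"type": "text", "text": "What is in this image?"},
#       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
#   ]}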
if output_format == OutputFormat.OPENAI_VISION:
generic_dataset_json = cls._convert_to_openai_multi_modal_content(
generic_dataset_json
)

return generic_dataset_json

@classmethod
@@ -405,17 +475,36 @@ def _get_input_dataset_from_synthetic(
prompt_tokens_mean: int,
prompt_tokens_stddev: int,
num_of_output_prompts: int,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
output_format: OutputFormat,
) -> Dict[str, Any]:
dataset_json: Dict[str, Any] = {}
dataset_json["features"] = [{"name": "text_input"}]
dataset_json["rows"] = []
for _ in range(num_of_output_prompts):
row: Dict[str, Any] = {"row": {}}
synthetic_prompt = cls._create_synthetic_prompt(
tokenizer,
prompt_tokens_mean,
prompt_tokens_stddev,
)
dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}})
row["row"]["text_input"] = synthetic_prompt

if output_format == OutputFormat.OPENAI_VISION:
synthetic_image = cls._create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)
row["row"]["image"] = synthetic_image

dataset_json["rows"].append(row)

return dataset_json

@@ -497,29 +586,37 @@ def _add_rows_to_generic_json(
@classmethod
def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict:
"""
Reads the input prompts and images from a JSONL file and converts them
into the required dataset format.
Parameters
----------
input_filename : Path
The path to the input file containing the prompts and/or images in
JSONL format.
Returns
-------
Dict
The dataset in the required format with the prompts and/or images
read from the file.
"""
cls.verify_file(input_filename)
prompts, images = cls._get_prompts_from_input_file(input_filename)
dataset_json: Dict[str, Any] = {}
dataset_json["features"] = [{"name": "text_input"}]
dataset_json["rows"] = [
{"row": {"text_input": prompt}} for prompt in input_file_prompts
]
dataset_json["rows"] = []
for prompt, image in zip(prompts, images):
content = {"text_input": prompt}
content.update({"image": image} if image else {})
dataset_json["rows"].append({"row": content})

return dataset_json

@classmethod
def _get_prompts_from_input_file(
cls, input_filename: Path
) -> Tuple[List[str], List[str]]:
"""
Reads the input prompts and images from a JSONL file and returns them as lists.
@@ -530,21 +627,63 @@ def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]:
Returns
-------
Tuple[List[str], List[str]]
A list of prompts and images read from the file.
"""
prompts = []
images = []
with open(input_filename, mode="r", newline=None) as file:
for line in file:
if line.strip():
prompts.append(load_json_str(line).get("text_input", "").strip())
images.append(load_json_str(line).get("image", "").strip())
return prompts, images

@classmethod
def verify_file(cls, input_filename: Path) -> None:
if not input_filename.exists():
raise FileNotFoundError(f"The file '{input_filename}' does not exist.")

@classmethod
def _convert_to_openai_multi_modal_content(
cls, generic_dataset_json: Dict[str, List[Dict]]
) -> Dict[str, List[Dict]]:
"""
Converts to multi-modal content format of OpenAI Chat Completions API.
"""
for row in generic_dataset_json["rows"]:
if row["image"]:
row["text_input"] = [
{
"type": "text",
"text": row["text_input"],
},
{
"type": "image_url",
"image_url": {"url": row["image"]},
},
]

return generic_dataset_json

@classmethod
def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
for row in input_file_dataset["rows"]:
filename = row["row"].get("image")
if filename:
img = Image.open(filename)
if img.format.lower() not in utils.get_enum_names(ImageFormat):
raise GenAIPerfException(
f"Unsupported image format '{img.format}' of "
f"the image '{filename}'."
)

img_base64 = utils.encode_image(img, img.format)
payload = f"data:image/{img.format.lower()};base64,{img_base64}"
row["row"]["image"] = payload

return input_file_dataset

@classmethod
def _convert_generic_json_to_output_format(
cls,
@@ -559,7 +698,10 @@ def _convert_generic_json_to_output_format(
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
if (
output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS
or output_format == OutputFormat.OPENAI_VISION
):
output_json = cls._convert_generic_json_to_openai_chat_completions_format(
generic_dataset,
add_model_name,
@@ -1424,3 +1566,20 @@ def _create_synthetic_prompt(
return SyntheticPromptGenerator.create_synthetic_prompt(
tokenizer, prompt_tokens_mean, prompt_tokens_stddev
)

@classmethod
def _create_synthetic_image(
cls,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
) -> str:
return SyntheticImageGenerator.create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)