Support Vision Language Model in GenAI-Perf (#756)
* POC LLaVA VLM support (#720)

* POC for LLaVA support

* non-streaming request in VLM tests

* image component sent in "image_url" field instead of HTML tag (see the payload sketch below)

* generate sample image instead of loading from docs

* add vision to endpoint mapping

* fixes for handling OutputFormat

* refactor - extract image preparation to a separate module

* fixes to the refactor

* replace match-case syntax with if-elif-else

* Update image payload format and fix tests

* A few clean-ups and tickets added for follow-up tasks

* Fix and add tests for vision format

* Remove output format from profile data parser

* Revert irrelevant code change

* Revert changes

* Remove unused dependency

* Comment out test_extra_inputs

---------

Co-authored-by: Hyunjae Woo <[email protected]>
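
For reference, the multi-modal request shape this work targets follows the OpenAI Chat Completions vision format (https://platform.openai.com/docs/guides/vision). A minimal sketch, with illustrative prompt text and data URL:

    {
      "messages": [
        {
          "role": "user",
          "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
          ]
        }
      ]
    }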

* Support multi-modal input from file for OpenAI Chat Completions (#749)

* add synthetic image generator (#751)

* synthetic image generator

* format randomization

* images should be base64-encoded arbitrarily

* randomized image format

* randomized image shape

* prepare SyntheticImageGenerator to support different image sources

* read from files

* python 3.10 support fixes

* remove unused imports

* skip sampled image sizes with negative values

* formats type fix

* remove unused variable

* synthetic image generator encodes images to base64

* image format not randomized

* sample each dimension independently

Co-authored-by: Hyunjae Woo <[email protected]>

* apply code-review suggestions

* update class name

* deterministic synthetic image generator

* add typing to SyntheticImageGenerator

* SyntheticImageGenerator doesn't load files

* SyntheticImageGenerator always encodes images to base64

* remove unused imports

* generate Gaussian noise instead of blank images (see the sketch below)

---------

Co-authored-by: Hyunjae Woo <[email protected]>
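
A minimal sketch of the generation approach described above (deterministic Gaussian-noise pixels, base64-encoded into a data URL). The function name and defaults here are illustrative, not the actual SyntheticImageGenerator API:

    import base64
    from io import BytesIO

    import numpy as np
    from PIL import Image


    def synthetic_image_data_url(width: int, height: int, image_format: str = "PNG") -> str:
        # Gaussian-noise pixels rather than a blank canvas, clipped to valid 8-bit RGB.
        rng = np.random.default_rng(seed=0)  # fixed seed keeps output deterministic
        pixels = rng.normal(loc=128, scale=32, size=(height, width, 3))
        image = Image.fromarray(np.clip(pixels, 0, 255).astype(np.uint8), "RGB")
        buffer = BytesIO()
        image.save(buffer, format=image_format)  # PNG or JPEG, as in ImageFormat
        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return f"data:image/{image_format.lower()};base64,{encoded}"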

* Add command line arguments for synthetic image generation (#753)

* Add CLI options for synthetic image generation (example invocation below)

* read image format from file when --input-file is used

* move encode_image method to utils

* Lazy import some modules

* Support synthetic image generation in GenAI-Perf (#754)

* support synthetic image generation for VLM model

* add test

* integrate synthetic image generator into LlmInputs

* add source images for synthetic image data

* use abs to get positive int

---------

Co-authored-by: Marek Wawrzos <[email protected]>
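
With these changes, a profiling run against a vision model can be configured from the command line. A sketch of an invocation — the flag names are inferred from the new parameters (image_width_mean, image_width_stddev, image_height_mean, image_height_stddev, image_format) and the model name is illustrative, so confirm against genai-perf --help:

    genai-perf \
      -m llava-hf/llava-1.5-7b-hf \
      --service-kind openai \
      --endpoint-type vision \
      --image-width-mean 100 --image-width-stddev 10 \
      --image-height-mean 100 --image-height-stddev 10 \
      --image-format png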
nv-hwoo and mwawrzos authored Jul 18, 2024
1 parent e4d9ef0 commit 30af885
Showing 20 changed files with 793 additions and 111 deletions.
185 changes: 172 additions & 13 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -20,11 +20,17 @@
from typing import Any, Dict, List, Optional, Tuple, cast

import requests
from genai_perf import utils
from genai_perf.constants import CNN_DAILY_MAIL, DEFAULT_INPUT_DATA_JSON, OPEN_ORCA
from genai_perf.exceptions import GenAIPerfException
from genai_perf.llm_inputs.synthetic_image_generator import (
ImageFormat,
SyntheticImageGenerator,
)
from genai_perf.llm_inputs.synthetic_prompt_generator import SyntheticPromptGenerator
from genai_perf.tokenizer import DEFAULT_TOKENIZER, Tokenizer, get_tokenizer
from genai_perf.utils import load_json_str
from PIL import Image
from requests import Response


@@ -43,6 +49,7 @@ class OutputFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
OPENAI_EMBEDDINGS = auto()
OPENAI_VISION = auto()
RANKINGS = auto()
TENSORRTLLM = auto()
VLLM = auto()
@@ -75,6 +82,11 @@ class LlmInputs:
DEFAULT_OUTPUT_TOKENS_STDDEV = 0
DEFAULT_NUM_PROMPTS = 100

DEFAULT_IMAGE_WIDTH_MEAN = 100
DEFAULT_IMAGE_WIDTH_STDDEV = 0
DEFAULT_IMAGE_HEIGHT_MEAN = 100
DEFAULT_IMAGE_HEIGHT_STDDEV = 0

EMPTY_JSON_IN_VLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_TENSORRTLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_OPENAI_PA_FORMAT: Dict = {"data": []}
@@ -97,6 +109,11 @@ def create_llm_inputs(
output_tokens_deterministic: bool = False,
prompt_tokens_mean: int = DEFAULT_PROMPT_TOKENS_MEAN,
prompt_tokens_stddev: int = DEFAULT_PROMPT_TOKENS_STDDEV,
image_width_mean: int = DEFAULT_IMAGE_WIDTH_MEAN,
image_width_stddev: int = DEFAULT_IMAGE_WIDTH_STDDEV,
image_height_mean: int = DEFAULT_IMAGE_HEIGHT_MEAN,
image_height_stddev: int = DEFAULT_IMAGE_HEIGHT_STDDEV,
image_format: ImageFormat = ImageFormat.PNG,
random_seed: int = DEFAULT_RANDOM_SEED,
num_of_output_prompts: int = DEFAULT_NUM_PROMPTS,
add_model_name: bool = False,
@@ -139,6 +156,16 @@ def create_llm_inputs(
The standard deviation of the length of the output to generate. This is only used if output_tokens_mean is provided.
output_tokens_deterministic:
If true, the output tokens will set the minimum and maximum tokens to be equivalent.
image_width_mean:
The mean width of images when generating synthetic image data.
image_width_stddev:
The standard deviation of width of images when generating synthetic image data.
image_height_mean:
The mean height of images when generating synthetic image data.
image_height_stddev:
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
@@ -175,6 +202,11 @@ def create_llm_inputs(
prompt_tokens_mean,
prompt_tokens_stddev,
num_of_output_prompts,
image_width_mean,
image_width_stddev,
image_height_mean,
image_height_stddev,
image_format,
batch_size,
input_filename,
)
@@ -210,6 +242,11 @@ def get_generic_dataset_json(
prompt_tokens_mean: int,
prompt_tokens_stddev: int,
num_of_output_prompts: int,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
batch_size: int,
input_filename: Optional[Path],
) -> Dict:
@@ -236,6 +273,16 @@ def get_generic_dataset_json(
The standard deviation of the length of the prompt to generate
num_of_output_prompts:
The number of synthetic output prompts to generate
image_width_mean:
The mean width of images when generating synthetic image data.
image_width_stddev:
The standard deviation of width of images when generating synthetic image data.
image_height_mean:
The mean height of images when generating synthetic image data.
image_height_stddev:
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
input_filename:
@@ -280,6 +327,12 @@ def get_generic_dataset_json(
)
else:
if input_type == PromptSource.DATASET:
# (TMA-1990) support VLM input from public dataset
if output_format == OutputFormat.OPENAI_VISION:
raise GenAIPerfException(
f"{OutputFormat.OPENAI_VISION.to_lowercase()} currently "
"does not support dataset as input."
)
dataset = cls._get_input_dataset_from_url(
dataset_name, starting_index, length
)
@@ -292,6 +345,12 @@ def get_generic_dataset_json(
prompt_tokens_mean,
prompt_tokens_stddev,
num_of_output_prompts,
image_width_mean,
image_width_stddev,
image_height_mean,
image_height_stddev,
image_format,
output_format,
)
generic_dataset_json = (
cls._convert_input_synthetic_or_file_dataset_to_generic_json(
@@ -301,6 +360,9 @@
elif input_type == PromptSource.FILE:
input_filename = cast(Path, input_filename)
input_file_dataset = cls._get_input_dataset_from_file(input_filename)
input_file_dataset = cls._encode_images_in_input_dataset(
input_file_dataset
)
generic_dataset_json = (
cls._convert_input_synthetic_or_file_dataset_to_generic_json(
input_file_dataset
@@ -309,6 +371,14 @@
else:
raise GenAIPerfException("Input source is not recognized.")

# When the generic_dataset_json contains multi-modal data (e.g. images),
# convert the format of the content to OpenAI multi-modal format:
# see https://platform.openai.com/docs/guides/vision
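# For example (illustrative), a generic row like
#   {"text_input": "What is in this image?", "image": "data:image/png;base64,..."}
# becomes
#   {"text_input": [
#       {"type": "text", "text": "What is in this image?"},
#       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
#   ]}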
if output_format == OutputFormat.OPENAI_VISION:
generic_dataset_json = cls._convert_to_openai_multi_modal_content(
generic_dataset_json
)

return generic_dataset_json

@classmethod
@@ -405,17 +475,36 @@ def _get_input_dataset_from_synthetic(
prompt_tokens_mean: int,
prompt_tokens_stddev: int,
num_of_output_prompts: int,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
output_format: OutputFormat,
) -> Dict[str, Any]:
dataset_json: Dict[str, Any] = {}
dataset_json["features"] = [{"name": "text_input"}]
dataset_json["rows"] = []
for _ in range(num_of_output_prompts):
row: Dict[str, Any] = {"row": {}}
synthetic_prompt = cls._create_synthetic_prompt(
tokenizer,
prompt_tokens_mean,
prompt_tokens_stddev,
)
dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}})
row["row"]["text_input"] = synthetic_prompt

if output_format == OutputFormat.OPENAI_VISION:
synthetic_image = cls._create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)
row["row"]["image"] = synthetic_image

dataset_json["rows"].append(row)

return dataset_json

@@ -497,29 +586,37 @@ def _add_rows_to_generic_json(
@classmethod
def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict:
"""
Reads the input prompts and images from a JSONL file and converts them
into the required dataset format.
Parameters
----------
input_filename : Path
The path to the input file containing the prompts and/or images in
JSONL format.
Returns
-------
Dict
The dataset in the required format with the prompts and/or images
read from the file.
"""
cls.verify_file(input_filename)
prompts, images = cls._get_prompts_from_input_file(input_filename)
dataset_json: Dict[str, Any] = {}
dataset_json["features"] = [{"name": "text_input"}]
dataset_json["rows"] = [
{"row": {"text_input": prompt}} for prompt in input_file_prompts
]
dataset_json["rows"] = []
for prompt, image in zip(prompts, images):
content = {"text_input": prompt}
content.update({"image": image} if image else {})
dataset_json["rows"].append({"row": content})

return dataset_json

@classmethod
def _get_prompts_from_input_file(
cls, input_filename: Path
) -> Tuple[List[str], List[str]]:
"""
Reads the input prompts and images from a JSONL file and returns them as lists.
@@ -530,21 +627,63 @@ def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]:
Returns
-------
Tuple[List[str], List[str]]
A list of prompts and images read from the file.
"""
prompts = []
images = []
with open(input_filename, mode="r", newline=None) as file:
for line in file:
if line.strip():
prompts.append(load_json_str(line).get("text_input", "").strip())
images.append(load_json_str(line).get("image", "").strip())
return prompts, images

@classmethod
def verify_file(cls, input_filename: Path) -> None:
if not input_filename.exists():
raise FileNotFoundError(f"The file '{input_filename}' does not exist.")

@classmethod
def _convert_to_openai_multi_modal_content(
cls, generic_dataset_json: Dict[str, List[Dict]]
) -> Dict[str, List[Dict]]:
"""
Converts to multi-modal content format of OpenAI Chat Completions API.
"""
for row in generic_dataset_json["rows"]:
if row["image"]:
row["text_input"] = [
{
"type": "text",
"text": row["text_input"],
},
{
"type": "image_url",
"image_url": {"url": row["image"]},
},
]

return generic_dataset_json

@classmethod
def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
for row in input_file_dataset["rows"]:
filename = row["row"].get("image")
if filename:
img = Image.open(filename)
if img.format.lower() not in utils.get_enum_names(ImageFormat):
raise GenAIPerfException(
f"Unsupported image format '{img.format}' of "
f"the image '{filename}'."
)

img_base64 = utils.encode_image(img, img.format)
payload = f"data:image/{img.format.lower()};base64,{img_base64}"
row["row"]["image"] = payload

return input_file_dataset

@classmethod
def _convert_generic_json_to_output_format(
cls,
@@ -559,7 +698,10 @@ def _convert_generic_json_to_output_format(
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
if (
output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS
or output_format == OutputFormat.OPENAI_VISION
):
output_json = cls._convert_generic_json_to_openai_chat_completions_format(
generic_dataset,
add_model_name,
@@ -1424,3 +1566,20 @@ def _create_synthetic_prompt(
return SyntheticPromptGenerator.create_synthetic_prompt(
tokenizer, prompt_tokens_mean, prompt_tokens_stddev
)

@classmethod
def _create_synthetic_image(
cls,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
) -> str:
return SyntheticImageGenerator.create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)