From 85d3b8910073dabcf48a6347ea9627269a45685e Mon Sep 17 00:00:00 2001
From: Hyunjae Woo
Date: Thu, 11 Jul 2024 09:48:21 -0700
Subject: [PATCH] support multi-modal input from file

---
 .../genai_perf/llm_inputs/llm_inputs.py      | 82 ++++++++++++++-----
 .../genai-perf/tests/test_llm_inputs.py      | 42 ++++++++--
 2 files changed, 96 insertions(+), 28 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
index 6fcd9372b..8f657ed42 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -326,6 +326,12 @@ def get_generic_dataset_json(
             )
         else:
             if input_type == PromptSource.DATASET:
+                # (TMA-1990) support VLM input from public dataset
+                if output_format == OutputFormat.OPENAI_VISION:
+                    raise GenAIPerfException(
+                        f"{OutputFormat.OPENAI_VISION.to_lowercase()} currently "
+                        "does not support dataset as input."
+                    )
                 dataset = cls._get_input_dataset_from_url(
                     dataset_name, starting_index, length
                 )
@@ -333,6 +339,13 @@ def get_generic_dataset_json(
                     dataset
                 )
             elif input_type == PromptSource.SYNTHETIC:
+                # (TMA-1989) support synthetic image generation for VLM input
+                if output_format == OutputFormat.OPENAI_VISION:
+                    raise GenAIPerfException(
+                        f"{OutputFormat.OPENAI_VISION.to_lowercase()} currently "
+                        "does not support synthetic input."
+                    )
+
                 synthetic_dataset = cls._get_input_dataset_from_synthetic(
                     tokenizer,
                     prompt_tokens_mean,
@@ -347,6 +360,9 @@ def get_generic_dataset_json(
             elif input_type == PromptSource.FILE:
                 input_filename = cast(Path, input_filename)
                 input_file_dataset = cls._get_input_dataset_from_file(input_filename)
+                input_file_dataset = cls._encode_images_in_input_dataset(
+                    input_file_dataset
+                )
                 generic_dataset_json = (
                     cls._convert_input_synthetic_or_file_dataset_to_generic_json(
                         input_file_dataset
@@ -355,10 +371,12 @@ def get_generic_dataset_json(
             else:
                 raise GenAIPerfException("Input source is not recognized.")
 
+        # When the generic_dataset_json contains multi-modal data (e.g. images),
+        # convert the format of the content to OpenAI multi-modal format:
+        # see https://platform.openai.com/docs/guides/vision
         if output_format == OutputFormat.OPENAI_VISION:
-            snowman_image = make_snowman_image()
-            generic_dataset_json = cls._add_images_to_generic_json(
-                generic_dataset_json, snowman_image
+            generic_dataset_json = cls._convert_to_openai_multi_modal_content(
+                generic_dataset_json
             )
 
         return generic_dataset_json
@@ -549,29 +567,37 @@ def _add_rows_to_generic_json(
 
     @classmethod
     def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict:
         """
-        Reads the input prompts from a JSONL file and converts them into the required dataset format.
+        Reads the input prompts and images from a JSONL file and converts them
+        into the required dataset format.
 
         Parameters
         ----------
         input_filename : Path
-            The path to the input file containing the prompts in JSONL format.
+            The path to the input file containing the prompts and/or images in
+            JSONL format.
 
         Returns
        -------
         Dict
-            The dataset in the required format with the prompts read from the file.
+            The dataset in the required format with the prompts and/or images
+            read from the file.
         """
         cls.verify_file(input_filename)
-        input_file_prompts = cls._get_prompts_from_input_file(input_filename)
+        prompts, images = cls._get_prompts_from_input_file(input_filename)
         dataset_json: Dict[str, Any] = {}
         dataset_json["features"] = [{"name": "text_input"}]
-        dataset_json["rows"] = [
-            {"row": {"text_input": prompt}} for prompt in input_file_prompts
-        ]
+        dataset_json["rows"] = []
+        for prompt, image in zip(prompts, images):
+            content = {"text_input": prompt}
+            content.update({"image": image} if image else {})
+            dataset_json["rows"].append({"row": content})
+
         return dataset_json
 
     @classmethod
-    def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]:
+    def _get_prompts_from_input_file(
+        cls, input_filename: Path
+    ) -> Tuple[List[str], List[str]]:
         """
         Reads the input prompts from a JSONL file and returns a list of prompts.
@@ -582,15 +608,17 @@ def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]:
 
         Returns
         -------
-        List[str]
-            A list of prompts read from the file.
+        Tuple[List[str], List[str]]
+            A list of prompts and images read from the file.
         """
         prompts = []
+        images = []
         with open(input_filename, mode="r", newline=None) as file:
             for line in file:
                 if line.strip():
                     prompts.append(json.loads(line).get("text_input", "").strip())
-        return prompts
+                    images.append(json.loads(line).get("image", "").strip())
+        return prompts, images
 
     @classmethod
     def verify_file(cls, input_filename: Path) -> None:
@@ -598,14 +626,14 @@ def verify_file(cls, input_filename: Path) -> None:
             raise FileNotFoundError(f"The file '{input_filename}' does not exist.")
 
     @classmethod
-    def _add_images_to_generic_json(
-        cls, generic_dataset_json: Dict[str, List[Dict]], img: Image
+    def _convert_to_openai_multi_modal_content(
+        cls, generic_dataset_json: Dict[str, List[Dict]]
     ) -> Dict[str, List[Dict]]:
-        # (TMA-1985) Support multiple image formats
-        img_format = ImageFormat.PNG
-        img_base64 = cls._encode_image(img, img_format)
+        """
+        Converts to multi-modal content format of OpenAI Chat Completions API.
+        """
         for row in generic_dataset_json["rows"]:
-            if isinstance(row["text_input"], str):
+            if row["image"]:
                 row["text_input"] = [
                     {
                         "type": "text",
@@ -613,12 +641,24 @@ def _add_images_to_generic_json(
                     },
                     {
                         "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+                        "image_url": {"url": row["image"]},
                     },
                 ]
 
         return generic_dataset_json
 
+    @classmethod
+    def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
+        for row in input_file_dataset["rows"]:
+            filename = row["row"].get("image")
+            if filename:
+                img = Image.open(filename)
+                # (TMA-1985) Support multiple image formats
+                img_base64 = cls._encode_image(img, ImageFormat.PNG)
+                row["row"]["image"] = f"data:image/png;base64,{img_base64}"
+
+        return input_file_dataset
+
     @classmethod
     def _encode_image(cls, img: Image, format=ImageFormat.PNG):
         """Encodes an image into base64 encoding."""
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
index e989224d1..ea9fe5b12 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
@@ -16,6 +16,7 @@
 import os
 import random
 import statistics
+from collections import namedtuple
 from pathlib import Path
 from unittest.mock import mock_open, patch
 
@@ -32,6 +33,7 @@
     make_snowman_image,
 )
 from genai_perf.tokenizer import Tokenizer
+from PIL import Image
 
 mocked_openorca_data = {
     "features": [
@@ -555,14 +557,12 @@ def test_llm_inputs_with_defaults(self, default_configured_url):
     def test_add_image_inputs_openai_vision(self) -> None:
         generic_json = {
             "rows": [
-                {"text_input": "test input one"},
-                {"text_input": "test input two"},
+                {"text_input": "test input one", "image": "test_image1"},
+                {"text_input": "test input two", "image": "test_image2"},
             ]
         }
 
-        img = make_snowman_image()
-        encoded_img = LlmInputs._encode_image(img)
-        generic_json = LlmInputs._add_images_to_generic_json(generic_json, img)
+        generic_json = LlmInputs._convert_to_openai_multi_modal_content(generic_json)
 
         row1 = generic_json["rows"][0]["text_input"]
         assert row1 == [
@@ -572,7 +572,7 @@ def test_add_image_inputs_openai_vision(self) -> None:
             },
             {
                 "type": "image_url",
-                "image_url": {"url": f"data:image/png;base64,{encoded_img}"},
+                "image_url": {"url": "test_image1"},
             },
         ]
 
@@ -584,7 +584,7 @@ def test_add_image_inputs_openai_vision(self) -> None:
             },
             {
                 "type": "image_url",
-                "image_url": {"url": f"data:image/png;base64,{encoded_img}"},
+                "image_url": {"url": "test_image2"},
             },
         ]
 
@@ -725,6 +725,34 @@ def test_get_input_file_with_multiple_prompts(self, mock_file, mock_exists):
         for i, prompt in enumerate(expected_prompts):
             assert dataset["rows"][i]["row"]["text_input"] == prompt
 
+    @patch("pathlib.Path.exists", return_value=True)
+    @patch("PIL.Image.open", return_value=Image.new("RGB", (10, 10)))
+    @patch(
+        "builtins.open",
+        new_callable=mock_open,
+        read_data=(
+            '{"text_input": "prompt1", "image": "image1.png"}\n'
+            '{"text_input": "prompt2", "image": "image2.png"}\n'
+            '{"text_input": "prompt3", "image": "image3.png"}\n'
+        ),
+    )
+    def test_get_input_file_with_multi_modal_data(
+        self, mock_exists, mock_image, mock_file
+    ):
+        Data = namedtuple("Data", ["text_input", "image"])
+        expected_data = [
+            Data(text_input="prompt1", image="image1.png"),
+            Data(text_input="prompt2", image="image2.png"),
+            Data(text_input="prompt3", image="image3.png"),
+        ]
+        dataset = LlmInputs._get_input_dataset_from_file(Path("somefile.txt"))
+
+        assert dataset is not None
+        assert len(dataset["rows"]) == len(expected_data)
+        for i, data in enumerate(expected_data):
+            assert dataset["rows"][i]["row"]["text_input"] == data.text_input
+            assert dataset["rows"][i]["row"]["image"] == data.image
+
     @pytest.mark.parametrize(
         "seed, model_name_list, index,model_selection_strategy,expected_model",
         [