Skip to content

Commit

Permalink
Add BaseFileInputRetriever
Browse files Browse the repository at this point in the history
WIP add start of support for payload params

Fix some MyPy errors

Add session_id, optional_data to payload in converters

Remove unnecessary variable and unwanted override

Add a method to build schedule flag for PA

Move optional_data to be top level args

Update rankings converter - untested

Modify variable name in cli unit test

Update test_wrapper.py

Update test_input_retriever factory with payload_input_retriever test, add payload_input_file in input config, update input file to payload input file in PayloadInputRetriever

Unit tests for converters

Route payload file to payload retriever

Add unit test for PayloadInputRetriever

Fix pre-commit errors

Fix failing test_cli.py

Exclude adding payload_input_file to JSON output
  • Loading branch information
debermudez authored and lkomali committed Dec 18, 2024
1 parent 7ca4922 commit c829674
Show file tree
Hide file tree
Showing 34 changed files with 1,199 additions and 141 deletions.
2 changes: 1 addition & 1 deletion genai-perf/genai_perf/export_data/json_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ def export(self) -> None:
0
]
filename = self._output_dir / f"{prefix}_genai_perf.json"
logger.info(f"Generating {filename}")
with open(str(filename), "w") as f:
f.write(json.dumps(self._stats_and_args, indent=2))

def _prepare_args_for_export(self) -> None:
self._args.pop("func", None)
self._args.pop("output_format", None)
self._args.pop("input_file", None)
self._args.pop("payload_input_file", None)
self._args["profile_export_file"] = str(self._args["profile_export_file"])
self._args["artifact_dir"] = str(self._args["artifact_dir"])
for k, v in self._args.items():
Expand Down
6 changes: 6 additions & 0 deletions genai-perf/genai_perf/inputs/converters/base_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ def _add_request_params(
) -> None:
for key, value in config.extra_inputs.items():
payload[key] = value

def _add_payload_params(
    self, payload: Dict[Any, Any], optional_data: Dict[Any, Any]
) -> None:
    """
    Merge optional per-row data into a request payload, in place.

    Args
    ----------
    payload : Dict[Any, Any]
        The request payload to extend. It is mutated; nothing is returned.
    optional_data : Dict[Any, Any]
        Extra key/value pairs to copy into ``payload``. A key that already
        exists in ``payload`` is overwritten with the value from
        ``optional_data``.
    """
    # dict.update performs the same key-by-key assignment as an explicit
    # loop over optional_data.items().
    payload.update(optional_data)
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def convert(
payload = {
"input": [{"type": "image_url", "url": img} for img in row.images]
}
self._add_payload_params(payload, row.optional_data)
request_body["data"].append({"payload": [payload]})

return request_body
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def convert(
}

self._add_request_params(payload, config)
self._add_payload_params(payload, row.optional_data)
request_body["data"].append({"payload": [payload]})

return request_body
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def _create_payload(
}

self._add_request_params(payload, config)
self._add_payload_params(payload, row.optional_data)
return payload

def _retrieve_content(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def convert(
"prompt": prompt,
}
self._add_request_params(payload, config)
self._add_payload_params(payload, row.optional_data)
request_body["data"].append({"payload": [payload]})

return request_body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ def convert(
"model": model_name,
"input": row.texts,
}

self._add_request_params(payload, config)
self._add_payload_params(payload, row.optional_data)
request_body["data"].append({"payload": [payload]})

return request_body
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def convert(
}

self._add_request_params(payload, config)
self._add_payload_params(payload, passage_entry.optional_data)
request_body["data"].append({"payload": [payload]})

return request_body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def convert(
"text_input": [text],
"max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS], # default
}

self._add_request_params(payload, config)
self._add_payload_params(payload, row.optional_data)
request_body["data"].append(payload)

return request_body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def convert(
"input_lengths": [len(token_ids)],
"request_output_len": [DEFAULT_TENSORRTLLM_MAX_TOKENS],
}

self._add_request_params(payload, config)
self._add_payload_params(payload, row.optional_data)
request_body["data"].append(payload)

return request_body
Expand Down
2 changes: 2 additions & 0 deletions genai-perf/genai_perf/inputs/converters/vllm_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def convert(
"text_input": text,
"exclude_input_in_output": [True], # default
}
optional_data = row.optional_data
self._add_request_params(payload, config)
self._add_payload_params(payload, optional_data)
request_body["data"].append(payload)

return request_body
Expand Down
1 change: 0 additions & 1 deletion genai-perf/genai_perf/inputs/input_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from enum import Enum, auto
from typing import Dict


class ModelSelectionStrategy(Enum):
Expand Down
3 changes: 3 additions & 0 deletions genai-perf/genai_perf/inputs/inputs_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ class InputsConfig:
# The filenames used for synthetic data generation
synthetic_input_filenames: Optional[List[str]] = field(default_factory=list)

# The filename where payload input data is available
payload_input_filename: Optional[Path] = Path("")

# The compression format of the images.
image_format: ImageFormat = ImageFormat.PNG

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

from genai_perf.inputs.retrievers.base_input_retriever import BaseInputRetriever
from genai_perf.inputs.retrievers.generic_dataset import FileData, GenericDataset


class BaseFileInputRetriever(BaseInputRetriever):
    """
    Shared base for input retrievers that read their data from files.

    Only ``_verify_file`` is implemented here. Every other method raises
    ``NotImplementedError`` and is expected to be overridden by subclasses.
    """

    def retrieve_data(self) -> GenericDataset:
        """
        Retrieve the dataset from a file or directory.

        Raises
        ------
        NotImplementedError
            Always; this base class provides no implementation.
        """
        raise NotImplementedError("This method should be implemented by subclasses.")

    def _get_input_dataset_from_file(self, filename: Path) -> FileData:
        """
        Retrieve the dataset held in a single JSONL file.

        Args
        ----------
        filename : Path
            The file to load.

        Raises
        ------
        NotImplementedError
            Always; this base class provides no implementation.
        """
        raise NotImplementedError("This method should be implemented by subclasses.")

    def _verify_file(self, filename: Path) -> None:
        """
        Check that the given path exists.

        Args
        ----------
        filename : Path
            The file path to verify.

        Raises
        ------
        FileNotFoundError
            If the path does not exist.
        """
        # Guard clause: nothing to report when the path is present.
        if filename.exists():
            return
        raise FileNotFoundError(f"The file '{filename}' does not exist.")

    def _get_content_from_input_file(
        self, filename: Path
    ) -> Union[Tuple[List[str], List[str]], Tuple[List[str], List[Dict[Any, Any]]]]:
        """
        Read a JSONL file and return one list per content type.

        Args
        ----------
        filename : Path
            The file to read.

        Raises
        ------
        NotImplementedError
            Always; this base class provides no implementation.
        """
        raise NotImplementedError("This method should be implemented by subclasses.")
28 changes: 6 additions & 22 deletions genai-perf/genai_perf/inputs/retrievers/file_input_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@
from genai_perf import utils
from genai_perf.exceptions import GenAIPerfException
from genai_perf.inputs.input_constants import DEFAULT_BATCH_SIZE
from genai_perf.inputs.inputs_config import InputsConfig
from genai_perf.inputs.retrievers.base_input_retriever import BaseInputRetriever
from genai_perf.inputs.retrievers.base_file_input_retriever import (
BaseFileInputRetriever,
)
from genai_perf.inputs.retrievers.generic_dataset import (
DataRow,
FileData,
Expand All @@ -43,7 +44,7 @@
from PIL import Image


class FileInputRetriever(BaseInputRetriever):
class FileInputRetriever(BaseFileInputRetriever):
"""
A input retriever class that handles input data provided by the user through
file and directories.
Expand Down Expand Up @@ -115,24 +116,7 @@ def _get_input_dataset_from_file(self, filename: Path) -> FileData:
"""
self._verify_file(filename)
prompts, images = self._get_content_from_input_file(filename)
return self._convert_content_to_data_file(prompts, images, filename)

def _verify_file(self, filename: Path) -> None:
"""
Verifies that the file exists.
Args
----------
filename : Path
The file path to verify.
Raises
------
FileNotFoundError
If the file does not exist.
"""
if not filename.exists():
raise FileNotFoundError(f"The file '{filename}' does not exist.")
return self._convert_content_to_data_file(prompts, filename, images)

def _get_content_from_input_file(
self, filename: Path
Expand Down Expand Up @@ -206,7 +190,7 @@ def _encode_image(self, filename: str) -> str:
return payload

def _convert_content_to_data_file(
self, prompts: List[str], images: List[str], filename: Path
self, prompts: List[str], filename: Path, images: List[str] = []
) -> FileData:
"""
Converts the content to a DataFile.
Expand Down
23 changes: 16 additions & 7 deletions genai-perf/genai_perf/inputs/retrievers/generic_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,34 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from dataclasses import dataclass, field
from typing import Dict, List, TypeAlias
from typing import Any, Dict, List, TypeAlias, Union

Filename: TypeAlias = str
TypeOfData: TypeAlias = str
ListOfData: TypeAlias = List[str]
DataRowDict: TypeAlias = Dict[TypeOfData, ListOfData]
DataRowDict: TypeAlias = Dict[str, Union[List[str], Dict[str, Any], str]]
GenericDatasetDict: TypeAlias = Dict[Filename, List[DataRowDict]]


@dataclass
class DataRow:
texts: List[str] = field(default_factory=list)
images: List[str] = field(default_factory=list)
optional_data: Dict[str, Any] = field(default_factory=dict)

def to_dict(self) -> DataRowDict:
"""
Converts the DataRow object to a dictionary.
"""
return {"texts": self.texts, "images": self.images}
datarow_dict: DataRowDict = {}

if self.texts:
datarow_dict["texts"] = self.texts
if self.images:
datarow_dict["images"] = self.images
if self.optional_data:
datarow_dict["optional_data"] = self.optional_data
return datarow_dict


@dataclass
Expand All @@ -55,8 +64,8 @@ def to_list(self) -> List[DataRowDict]:
Converts the FileData object to a list.
Output format example for two payloads from a file:
[
{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']},
{'texts': ['text3', 'text4'], 'images': ['image3', 'image4']}
{'texts': ['text1', 'text2'], 'images': ['image1', 'image2'], 'optional_data': {}, 'session_id': 'session_id1'},
{'texts': ['text3', 'text4'], 'images': ['image3', 'image4'], 'optional_data': {}, 'session_id': 'session_id2'},
]
"""
return [row.to_dict() for row in self.rows]
Expand All @@ -71,8 +80,8 @@ def to_dict(self) -> GenericDatasetDict:
Converts the entire DataStructure object to a dictionary.
Output format example for one payload from two files:
{
'file_0': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}],
'file_1': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}]
'file_0': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2'], 'optional_data': {}, 'session_id': 'session_id1'}],
'file_1': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2'], 'optional_data': {}, 'session_id': 'session_id2'}],
}
"""
return {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from genai_perf.inputs.inputs_config import InputsConfig
from genai_perf.inputs.retrievers.base_input_retriever import BaseInputRetriever
from genai_perf.inputs.retrievers.file_input_retriever import FileInputRetriever
from genai_perf.inputs.retrievers.payload_input_retriever import PayloadInputRetriever
from genai_perf.inputs.retrievers.synthetic_data_retriever import SyntheticDataRetriever


Expand All @@ -43,6 +44,7 @@ def create(config: InputsConfig) -> BaseInputRetriever:
retrievers = {
PromptSource.SYNTHETIC: SyntheticDataRetriever,
PromptSource.FILE: FileInputRetriever,
PromptSource.PAYLOAD: PayloadInputRetriever,
}
input_type = config.input_type
if input_type not in retrievers:
Expand Down
Loading

0 comments on commit c829674

Please sign in to comment.