From 4048c7078df37882f4bac8b5e0bb9cf918cacc5e Mon Sep 17 00:00:00 2001 From: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:31:24 -0700 Subject: [PATCH] Add file input retriever class for reading from file source (#86) * add file input retriever * move input source check to inputs.py * create unittests for file input retriever --- .../genai_perf/inputs/file_input_retriever.py | 185 ++++++++++++++ .../inputs/input_retriever_factory.py | 192 ++------------- genai-perf/genai_perf/inputs/inputs.py | 19 +- genai-perf/tests/test_file_input_retriever.py | 227 ++++++++++++++++++ genai-perf/tests/test_input_embeddings.py | 9 +- .../tests/test_input_image_retrieval.py | 4 +- genai-perf/tests/test_input_rankings.py | 68 ------ .../tests/test_input_retriever_factory.py | 93 +------ 8 files changed, 453 insertions(+), 344 deletions(-) create mode 100644 genai-perf/genai_perf/inputs/file_input_retriever.py create mode 100644 genai-perf/tests/test_file_input_retriever.py diff --git a/genai-perf/genai_perf/inputs/file_input_retriever.py b/genai-perf/genai_perf/inputs/file_input_retriever.py new file mode 100644 index 00000000..940b8398 --- /dev/null +++ b/genai-perf/genai_perf/inputs/file_input_retriever.py @@ -0,0 +1,185 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import random +from pathlib import Path +from typing import Any, Dict, List, Tuple + +from genai_perf.inputs.input_constants import DEFAULT_BATCH_SIZE, OutputFormat +from genai_perf.inputs.inputs_config import InputsConfig +from genai_perf.utils import load_json_str + + +class FileInputRetriever: + """ + A input retriever class that handles input data provided by the user through + file and directories. + """ + + def __init__(self, config: InputsConfig) -> None: + self.config = config + + # TODO: match return type to retriever interface + def retrieve_data(self) -> Dict[str, Any]: + if self.config.output_format == OutputFormat.OPENAI_EMBEDDINGS: + return self._get_input_dataset_from_embeddings_file() + elif self.config.output_format == OutputFormat.RANKINGS: + queries_filename = self.config.input_filename / "queries.jsonl" + passages_filename = self.config.input_filename / "passages.jsonl" + return self._get_input_dataset_from_rankings_files( + queries_filename, passages_filename + ) + elif self.config.output_format == OutputFormat.IMAGE_RETRIEVAL: + return self._get_input_dataset_from_file() + else: + return self._get_input_dataset_from_file() + + def _get_input_dataset_from_embeddings_file(self) -> Dict[str, Any]: + with open(self.config.input_filename, "r") as file: + file_content = [load_json_str(line) for line in file] + + texts = [item["text"] for item in file_content] + + if self.config.batch_size > len(texts): + raise ValueError( + "Batch size cannot be larger than the number of available texts" + ) + + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "input"}] + dataset_json["rows"] = [] + + for _ in range(self.config.num_prompts): + sampled_texts = random.sample(texts, self.config.batch_size) + dataset_json["rows"].append({"row": {"payload": {"input": sampled_texts}}}) + + return dataset_json + + def _get_input_dataset_from_rankings_files( + self, + queries_filename: Path, + passages_filename: Path, + ) -> Dict[str, Any]: + + with open(queries_filename, "r") as file: + queries_content = [load_json_str(line) for line in file] + queries_texts = [item for item in queries_content] + + with open(passages_filename, "r") as file: + passages_content = [load_json_str(line) for line in file] + passages_texts = [item for item in passages_content] + + if self.config.batch_size > len(passages_texts): + raise ValueError( + "Batch size cannot be larger than the number of available passages" + ) + + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "input"}] + dataset_json["rows"] = [] + + for _ in range(self.config.num_prompts): + sampled_texts = random.sample(passages_texts, self.config.batch_size) + query_sample = random.choice(queries_texts) + entry_dict: Dict = {} + entry_dict["query"] = query_sample + entry_dict["passages"] = sampled_texts + dataset_json["rows"].append({"row": {"payload": entry_dict}}) + return dataset_json + + def _get_input_dataset_from_file(self) -> Dict[str, Any]: + """ + Returns + ------- + Dict + The dataset in the required format with the prompts and/or images + read from the file. + """ + self._verify_file() + prompts, images = self._get_prompts_from_input_file() + if self.config.batch_size > len(prompts): + raise ValueError( + "Batch size cannot be larger than the number of available texts" + ) + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "text_input"}] + dataset_json["rows"] = [] + + if self.config.batch_size == DEFAULT_BATCH_SIZE: + for prompt, image in zip(prompts, images): + content = {} + if prompt is not None: + content["text_input"] = prompt + if image is not None: + content["image"] = image + dataset_json["rows"].append({"row": content}) + else: + for _ in range(self.config.num_prompts): + content_array = [] + sampled_indices = random.sample( + range(len(prompts)), self.config.batch_size + ) + sampled_texts_images = [ + (prompts[i], images[i]) for i in sampled_indices + ] + + for prompt, image in sampled_texts_images: + content = {} + if prompt is not None: + content["text_input"] = prompt + if image is not None: + content["image"] = image + content_array.append(content) + dataset_json["rows"].append({"row": content_array}) + + return dataset_json + + def _verify_file(self) -> None: + if not self.config.input_filename.exists(): + raise FileNotFoundError( + f"The file '{self.config.input_filename}' does not exist." + ) + + def _get_prompts_from_input_file(self) -> Tuple[List[str], List[str]]: + """ + Reads the input prompts from a JSONL file and returns a list of prompts. + + Returns + ------- + Tuple[List[str], List[str]] + A list of prompts and images read from the file. + """ + prompts = [] + images = [] + with open(self.config.input_filename, mode="r", newline=None) as file: + for line in file: + if line.strip(): + # None if not provided + prompt = load_json_str(line).get("text_input") + image = load_json_str(line).get("image") + prompts.append(prompt.strip() if prompt else prompt) + images.append(image.strip() if image else image) + return prompts, images diff --git a/genai-perf/genai_perf/inputs/input_retriever_factory.py b/genai-perf/genai_perf/inputs/input_retriever_factory.py index 4da5fbf4..618f47c4 100644 --- a/genai-perf/genai_perf/inputs/input_retriever_factory.py +++ b/genai-perf/genai_perf/inputs/input_retriever_factory.py @@ -24,15 +24,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import random -from pathlib import Path -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import requests from genai_perf import utils from genai_perf.exceptions import GenAIPerfException +from genai_perf.inputs.file_input_retriever import FileInputRetriever from genai_perf.inputs.input_constants import ( - DEFAULT_BATCH_SIZE, OutputFormat, PromptSource, dataset_url_map, @@ -43,7 +41,6 @@ SyntheticImageGenerator, ) from genai_perf.inputs.synthetic_prompt_generator import SyntheticPromptGenerator -from genai_perf.utils import load_json_str from PIL import Image from requests import Response @@ -62,44 +59,21 @@ def get_input_data(self) -> Dict: The generic dataset JSON """ - if self.config.output_format == OutputFormat.OPENAI_EMBEDDINGS: - if self.config.input_type != PromptSource.FILE: - raise GenAIPerfException( - f"{OutputFormat.OPENAI_EMBEDDINGS.to_lowercase()} only supports a file as input." - ) - input_file_dataset = self._get_input_dataset_from_embeddings_file() - generic_dataset_json = ( - self._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset - ) - ) - elif self.config.output_format == OutputFormat.RANKINGS: - if self.config.input_type != PromptSource.FILE: - raise GenAIPerfException( - f"{OutputFormat.RANKINGS.to_lowercase()} only supports a directory as input." - ) - queries_filename = self.config.input_filename / "queries.jsonl" - passages_filename = self.config.input_filename / "passages.jsonl" - input_file_dataset = self._get_input_dataset_from_rankings_files( - queries_filename, passages_filename - ) - generic_dataset_json = ( - self._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset - ) - ) - elif self.config.output_format == OutputFormat.IMAGE_RETRIEVAL: - if self.config.input_type != PromptSource.FILE: - raise GenAIPerfException( - f"{OutputFormat.IMAGE_RETRIEVAL.to_lowercase()} only supports a file as input." - ) - input_file_dataset = self._get_input_dataset_from_file() - input_file_dataset = self._encode_images_in_input_dataset( - input_file_dataset - ) + if self.config.output_format in [ + OutputFormat.OPENAI_EMBEDDINGS, + OutputFormat.RANKINGS, + OutputFormat.IMAGE_RETRIEVAL, + ]: + # TODO: remove once the factory fully integrates retrievers + file_retriever = FileInputRetriever(self.config) + input_data = file_retriever.retrieve_data() + + if self.config.output_format == OutputFormat.IMAGE_RETRIEVAL: + input_data = self._encode_images_in_input_dataset(input_data) + generic_dataset_json = ( self._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset + input_data ) ) else: @@ -122,10 +96,11 @@ def get_input_data(self) -> Dict: ) ) elif self.config.input_type == PromptSource.FILE: - input_file_dataset = self._get_input_dataset_from_file() - input_file_dataset = self._encode_images_in_input_dataset( - input_file_dataset - ) + # TODO: remove once the factory fully integrates retrievers + file_retriever = FileInputRetriever(self.config) + input_data = file_retriever.retrieve_data() + + input_file_dataset = self._encode_images_in_input_dataset(input_data) generic_dataset_json = ( self._convert_input_synthetic_or_file_dataset_to_generic_json( input_file_dataset @@ -136,27 +111,6 @@ def get_input_data(self) -> Dict: return generic_dataset_json - def _get_input_dataset_from_embeddings_file(self) -> Dict[str, Any]: - with open(self.config.input_filename, "r") as file: - file_content = [load_json_str(line) for line in file] - - texts = [item["text"] for item in file_content] - - if self.config.batch_size > len(texts): - raise ValueError( - "Batch size cannot be larger than the number of available texts" - ) - - dataset_json: Dict[str, Any] = {} - dataset_json["features"] = [{"name": "input"}] - dataset_json["rows"] = [] - - for _ in range(self.config.num_prompts): - sampled_texts = random.sample(texts, self.config.batch_size) - dataset_json["rows"].append({"row": {"payload": {"input": sampled_texts}}}) - - return dataset_json - def _convert_input_synthetic_or_file_dataset_to_generic_json( self, dataset: Dict ) -> Dict[str, List[Dict]]: @@ -164,85 +118,6 @@ def _convert_input_synthetic_or_file_dataset_to_generic_json( return generic_dataset_json - def _get_input_dataset_from_rankings_files( - self, - queries_filename: Path, - passages_filename: Path, - ) -> Dict[str, Any]: - - with open(queries_filename, "r") as file: - queries_content = [load_json_str(line) for line in file] - queries_texts = [item for item in queries_content] - - with open(passages_filename, "r") as file: - passages_content = [load_json_str(line) for line in file] - passages_texts = [item for item in passages_content] - - if self.config.batch_size > len(passages_texts): - raise ValueError( - "Batch size cannot be larger than the number of available passages" - ) - - dataset_json: Dict[str, Any] = {} - dataset_json["features"] = [{"name": "input"}] - dataset_json["rows"] = [] - - for _ in range(self.config.num_prompts): - sampled_texts = random.sample(passages_texts, self.config.batch_size) - query_sample = random.choice(queries_texts) - entry_dict: Dict = {} - entry_dict["query"] = query_sample - entry_dict["passages"] = sampled_texts - dataset_json["rows"].append({"row": {"payload": entry_dict}}) - return dataset_json - - def _get_input_dataset_from_file(self) -> Dict: - """ - Returns - ------- - Dict - The dataset in the required format with the prompts and/or images - read from the file. - """ - self._verify_file() - prompts, images = self._get_prompts_from_input_file() - if self.config.batch_size > len(prompts): - raise ValueError( - "Batch size cannot be larger than the number of available texts" - ) - dataset_json: Dict[str, Any] = {} - dataset_json["features"] = [{"name": "text_input"}] - dataset_json["rows"] = [] - - if self.config.batch_size == DEFAULT_BATCH_SIZE: - for prompt, image in zip(prompts, images): - content = {} - if prompt is not None: - content["text_input"] = prompt - if image is not None: - content["image"] = image - dataset_json["rows"].append({"row": content}) - else: - for _ in range(self.config.num_prompts): - content_array = [] - sampled_indices = random.sample( - range(len(prompts)), self.config.batch_size - ) - sampled_texts_images = [ - (prompts[i], images[i]) for i in sampled_indices - ] - - for prompt, image in sampled_texts_images: - content = {} - if prompt is not None: - content["text_input"] = prompt - if image is not None: - content["image"] = image - content_array.append(content) - dataset_json["rows"].append({"row": content_array}) - - return dataset_json - def _encode_images_in_input_dataset(self, input_file_dataset: Dict) -> Dict: for row in input_file_dataset["rows"]: if isinstance(row["row"], list): @@ -323,27 +198,6 @@ def _add_rows_to_generic_json( return generic_input_json - def _get_prompts_from_input_file(self) -> Tuple[List[str], List[str]]: - """ - Reads the input prompts from a JSONL file and returns a list of prompts. - - Returns - ------- - Tuple[List[str], List[str]] - A list of prompts and images read from the file. - """ - prompts = [] - images = [] - with open(self.config.input_filename, mode="r", newline=None) as file: - for line in file: - if line.strip(): - # None if not provided - prompt = load_json_str(line).get("text_input") - image = load_json_str(line).get("image") - prompts.append(prompt.strip() if prompt else prompt) - images.append(image.strip() if image else image) - return prompts, images - def _encode_image(self, filename: str) -> str: img = Image.open(filename) if img is None: @@ -403,12 +257,6 @@ def _create_synthetic_image(self) -> str: image_format=self.config.image_format, ) - def _verify_file(self) -> None: - if not self.config.input_filename.exists(): - raise FileNotFoundError( - f"The file '{self.config.input_filename}' does not exist." - ) - def _query_server(self, configured_url: str) -> Response: try: response = requests.get(configured_url) diff --git a/genai-perf/genai_perf/inputs/inputs.py b/genai-perf/genai_perf/inputs/inputs.py index 59c3a21d..c271aabb 100644 --- a/genai-perf/genai_perf/inputs/inputs.py +++ b/genai-perf/genai_perf/inputs/inputs.py @@ -61,17 +61,12 @@ def create_inputs(self) -> Dict: return json_in_pa_format def _check_for_valid_args(self) -> None: + self._check_for_supported_input_type() self._check_for_dataset_name_if_input_type_is_url() self._check_for_tokenzier_if_input_type_is_synthetic() self._check_for_valid_starting_index() self._check_for_valid_length() - def _verify_file(self) -> None: - if not self.config.input_filename.exists(): - raise FileNotFoundError( - f"The file '{self.config.input_filename}' does not exist." - ) - def _convert_generic_json_to_output_format(self, generic_dataset) -> Dict: converter = OutputFormatConverterFactory.create(self.config.output_format) return converter.convert(generic_dataset, self.config) @@ -81,6 +76,18 @@ def _write_json_to_file(self, json_in_pa_format: Dict) -> None: with open(str(filename), "w") as f: f.write(json.dumps(json_in_pa_format, indent=2)) + def _check_for_supported_input_type(self) -> None: + if self.config.output_format in [ + OutputFormat.OPENAI_EMBEDDINGS, + OutputFormat.RANKINGS, + OutputFormat.IMAGE_RETRIEVAL, + ]: + if self.config.input_type != PromptSource.FILE: + raise GenAIPerfException( + f"{self.config.output_format.to_lowercase()} only supports " + "a file as input source." + ) + def _check_for_dataset_name_if_input_type_is_url(self) -> None: if ( self.config.input_type == PromptSource.DATASET diff --git a/genai-perf/tests/test_file_input_retriever.py b/genai-perf/tests/test_file_input_retriever.py new file mode 100644 index 00000000..59d04644 --- /dev/null +++ b/genai-perf/tests/test_file_input_retriever.py @@ -0,0 +1,227 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from collections import namedtuple +from pathlib import Path +from unittest.mock import mock_open, patch + +import pytest +from genai_perf.inputs.file_input_retriever import FileInputRetriever +from genai_perf.inputs.input_constants import ModelSelectionStrategy +from genai_perf.inputs.inputs_config import InputsConfig +from PIL import Image + + +class TestFileInputRetriever: + @patch("pathlib.Path.exists", return_value=True) + @patch( + "builtins.open", + new_callable=mock_open, + read_data="\n".join( + [ + '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', + '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', + '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', + '{"text": "what state did they film daddy\'s home 2"}', + ] + ), + ) + def test_get_input_dataset_from_embeddings_file(self, mock_file, mock_exists): + batch_size = 3 + config = InputsConfig( + input_filename=Path("embeddings.jsonl"), + batch_size=batch_size, + num_prompts=100, + ) + file_retriever = FileInputRetriever(config) + dataset = file_retriever._get_input_dataset_from_embeddings_file() + + assert dataset is not None + assert len(dataset["rows"]) == 100 + for row in dataset["rows"]: + assert "row" in row + assert "payload" in row["row"] + payload = row["row"]["payload"] + assert "input" in payload + assert isinstance(payload["input"], list) + assert len(payload["input"]) == batch_size + + # Try error case where batch size is larger than the number of available texts + with pytest.raises( + ValueError, + match="Batch size cannot be larger than the number of available texts", + ): + config.batch_size = 5 + file_retriever._get_input_dataset_from_embeddings_file() + + def open_side_effects(self, filepath, *args, **kwargs): + queries_content = "\n".join( + [ + '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', + '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', + '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', + ] + ) + passages_content = "\n".join( + [ + '{"text": "Eric Anderson (sociologist) Eric Anderson (born January 18, 1968) is an American sociologist"}', + '{"text": "Kevin Loader is a British film and television producer. "}', + '{"text": "Barton Mine, also known as Net Lake Mine, is an abandoned surface and underground mine in Northeastern Ontario"}', + ] + ) + + file_contents = { + "queries.jsonl": queries_content, + "passages.jsonl": passages_content, + } + return mock_open( + read_data=file_contents.get(filepath, file_contents["queries.jsonl"]) + )() + + mock_open_obj = mock_open() + mock_open_obj.side_effect = open_side_effects + + @patch("pathlib.Path.exists", return_value=True) + @patch("builtins.open", mock_open_obj) + def test_get_input_dataset_from_rankings_file(self, mock_file): + queries_filename = Path("queries.jsonl") + passages_filename = Path("passages.jsonl") + batch_size = 2 + config = InputsConfig( + batch_size=batch_size, + num_prompts=100, + ) + file_retriever = FileInputRetriever(config) + dataset = file_retriever._get_input_dataset_from_rankings_files( + queries_filename=queries_filename, passages_filename=passages_filename + ) + + assert dataset is not None + assert len(dataset["rows"]) == 100 + for row in dataset["rows"]: + assert "row" in row + assert "payload" in row["row"] + payload = row["row"]["payload"] + assert "query" in payload + assert "passages" in payload + assert isinstance(payload["passages"], list) + assert len(payload["passages"]) == batch_size + + # Try error case where batch size is larger than the number of available texts + with pytest.raises( + ValueError, + match="Batch size cannot be larger than the number of available passages", + ): + config.batch_size = 5 + file_retriever._get_input_dataset_from_rankings_files( + queries_filename, passages_filename + ) + + def test_get_input_file_without_file_existing(self): + file_retriever = FileInputRetriever( + InputsConfig(input_filename=Path("prompt.txt")) + ) + with pytest.raises(FileNotFoundError): + file_retriever._get_input_dataset_from_file() + + @patch("pathlib.Path.exists", return_value=True) + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"text_input": "single prompt"}\n', + ) + def test_get_input_file_with_single_prompt(self, mock_file, mock_exists): + expected_prompts = ["single prompt"] + file_retriever = FileInputRetriever( + InputsConfig( + model_name=["test_model_A"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + input_filename=Path("prompt.txt"), + ) + ) + dataset = file_retriever._get_input_dataset_from_file() + + assert dataset is not None + assert len(dataset["rows"]) == len(expected_prompts) + for i, prompt in enumerate(expected_prompts): + assert dataset["rows"][i]["row"]["text_input"] == prompt + + @patch("pathlib.Path.exists", return_value=True) + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"text_input": "prompt1"}\n{"text_input": "prompt2"}\n{"text_input": "prompt3"}\n', + ) + def test_get_input_file_with_multiple_prompts(self, mock_file, mock_exists): + expected_prompts = ["prompt1", "prompt2", "prompt3"] + file_retriever = FileInputRetriever( + InputsConfig( + model_name=["test_model_A"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + input_filename=Path("prompt.txt"), + ) + ) + dataset = file_retriever._get_input_dataset_from_file() + + assert dataset is not None + assert len(dataset["rows"]) == len(expected_prompts) + for i, prompt in enumerate(expected_prompts): + assert dataset["rows"][i]["row"]["text_input"] == prompt + + @patch("pathlib.Path.exists", return_value=True) + @patch("PIL.Image.open", return_value=Image.new("RGB", (10, 10))) + @patch( + "builtins.open", + new_callable=mock_open, + read_data=( + '{"text_input": "prompt1", "image": "image1.png"}\n' + '{"text_input": "prompt2", "image": "image2.png"}\n' + '{"text_input": "prompt3", "image": "image3.png"}\n' + ), + ) + def test_get_input_file_with_multi_modal_data( + self, mock_exists, mock_image, mock_file + ): + file_retriever = FileInputRetriever( + InputsConfig( + model_name=["test_model_A"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + input_filename=Path("prompt.txt"), + ) + ) + Data = namedtuple("Data", ["text_input", "image"]) + expected_data = [ + Data(text_input="prompt1", image="image1.png"), + Data(text_input="prompt2", image="image2.png"), + Data(text_input="prompt3", image="image3.png"), + ] + dataset = file_retriever._get_input_dataset_from_file() + + assert dataset is not None + assert len(dataset["rows"]) == len(expected_data) + for i, data in enumerate(expected_data): + assert dataset["rows"][i]["row"]["text_input"] == data.text_input + assert dataset["rows"][i]["row"]["image"] == data.image diff --git a/genai-perf/tests/test_input_embeddings.py b/genai-perf/tests/test_input_embeddings.py index 280538d8..2172385d 100644 --- a/genai-perf/tests/test_input_embeddings.py +++ b/genai-perf/tests/test_input_embeddings.py @@ -28,12 +28,12 @@ from unittest.mock import mock_open, patch import pytest +from genai_perf.inputs.file_input_retriever import FileInputRetriever from genai_perf.inputs.input_constants import ( ModelSelectionStrategy, OutputFormat, PromptSource, ) -from genai_perf.inputs.input_retriever_factory import InputRetrieverFactory from genai_perf.inputs.inputs import Inputs from genai_perf.inputs.inputs_config import InputsConfig @@ -59,9 +59,8 @@ def test_get_input_dataset_from_embeddings_file(self, mock_file, mock_exists): batch_size=batch_size, num_prompts=100, ) - input_retriever_factory = InputRetrieverFactory(config) - - dataset = input_retriever_factory._get_input_dataset_from_embeddings_file() + file_retriever = FileInputRetriever(config) + dataset = file_retriever._get_input_dataset_from_embeddings_file() assert dataset is not None assert len(dataset["rows"]) == 100 @@ -79,7 +78,7 @@ def test_get_input_dataset_from_embeddings_file(self, mock_file, mock_exists): match="Batch size cannot be larger than the number of available texts", ): config.batch_size = 5 - input_retriever_factory._get_input_dataset_from_embeddings_file() + file_retriever._get_input_dataset_from_embeddings_file() def test_convert_generic_json_to_openai_embeddings_format(self): generic_dataset = { diff --git a/genai-perf/tests/test_input_image_retrieval.py b/genai-perf/tests/test_input_image_retrieval.py index 9c34d4ca..4a8535da 100644 --- a/genai-perf/tests/test_input_image_retrieval.py +++ b/genai-perf/tests/test_input_image_retrieval.py @@ -27,7 +27,7 @@ class TestInputsImageRetrieval: return_value="", ) @patch( - "genai_perf.inputs.input_retriever_factory.InputRetrieverFactory._get_input_dataset_from_file" + "genai_perf.inputs.input_retriever_factory.FileInputRetriever._get_input_dataset_from_file" ) def test_image_retrieval(self, mock_get_input, mock_encode_image, mock_write_json): mock_get_input.return_value = { @@ -80,7 +80,7 @@ def test_image_retrieval(self, mock_get_input, mock_encode_image, mock_write_jso @patch("genai_perf.inputs.inputs.Inputs._write_json_to_file") @patch( - "genai_perf.inputs.input_retriever_factory.InputRetrieverFactory._get_input_dataset_from_file" + "genai_perf.inputs.input_retriever_factory.FileInputRetriever._get_input_dataset_from_file" ) @patch( "genai_perf.inputs.input_retriever_factory.InputRetrieverFactory._encode_image" diff --git a/genai-perf/tests/test_input_rankings.py b/genai-perf/tests/test_input_rankings.py index 89ad1979..79fcb964 100644 --- a/genai-perf/tests/test_input_rankings.py +++ b/genai-perf/tests/test_input_rankings.py @@ -24,85 +24,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from pathlib import Path -from unittest.mock import mock_open, patch - -import pytest from genai_perf.inputs.input_constants import ( ModelSelectionStrategy, OutputFormat, PromptSource, ) -from genai_perf.inputs.input_retriever_factory import InputRetrieverFactory from genai_perf.inputs.inputs import Inputs from genai_perf.inputs.inputs_config import InputsConfig class TestInputsRankings: - def open_side_effects(self, filepath, *args, **kwargs): - queries_content = "\n".join( - [ - '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', - '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', - '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', - ] - ) - passages_content = "\n".join( - [ - '{"text": "Eric Anderson (sociologist) Eric Anderson (born January 18, 1968) is an American sociologist"}', - '{"text": "Kevin Loader is a British film and television producer. "}', - '{"text": "Barton Mine, also known as Net Lake Mine, is an abandoned surface and underground mine in Northeastern Ontario"}', - ] - ) - - file_contents = { - "queries.jsonl": queries_content, - "passages.jsonl": passages_content, - } - return mock_open( - read_data=file_contents.get(filepath, file_contents["queries.jsonl"]) - )() - - mock_open_obj = mock_open() - mock_open_obj.side_effect = open_side_effects - - @patch("pathlib.Path.exists", return_value=True) - @patch("builtins.open", mock_open_obj) - def test_get_input_dataset_from_rankings_file(self, mock_file): - queries_filename = Path("queries.jsonl") - passages_filename = Path("passages.jsonl") - batch_size = 2 - config = InputsConfig( - batch_size=batch_size, - num_prompts=100, - ) - input_retriever_factory = InputRetrieverFactory(config) - dataset = input_retriever_factory._get_input_dataset_from_rankings_files( - queries_filename=queries_filename, passages_filename=passages_filename - ) - - assert dataset is not None - assert len(dataset["rows"]) == 100 - for row in dataset["rows"]: - assert "row" in row - assert "payload" in row["row"] - payload = row["row"]["payload"] - assert "query" in payload - assert "passages" in payload - assert isinstance(payload["passages"], list) - assert len(payload["passages"]) == batch_size - - # Try error case where batch size is larger than the number of available texts - with pytest.raises( - ValueError, - match="Batch size cannot be larger than the number of available passages", - ): - config.batch_size = 5 - input_retriever_factory._get_input_dataset_from_rankings_files( - queries_filename, passages_filename - ) - def test_convert_generic_json_to_openai_rankings_format(self): generic_dataset = { "rows": [ diff --git a/genai-perf/tests/test_input_retriever_factory.py b/genai-perf/tests/test_input_retriever_factory.py index 8a8fd4d6..cb81b03f 100644 --- a/genai-perf/tests/test_input_retriever_factory.py +++ b/genai-perf/tests/test_input_retriever_factory.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from collections import namedtuple -from pathlib import Path -from unittest.mock import mock_open, patch +from unittest.mock import patch import pytest import responses from genai_perf.exceptions import GenAIPerfException from genai_perf.inputs import input_constants as ic -from genai_perf.inputs.input_constants import ModelSelectionStrategy, OutputFormat +from genai_perf.inputs.input_constants import OutputFormat from genai_perf.inputs.input_retriever_factory import InputRetrieverFactory from genai_perf.inputs.inputs_config import InputsConfig from genai_perf.inputs.synthetic_image_generator import ImageFormat from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer -from PIL import Image mocked_openorca_data = { "features": [ @@ -196,89 +193,3 @@ def test_inputs_with_defaults(self, default_configured_url): assert dataset_json is not None assert len(dataset_json["rows"]) == TEST_LENGTH - - def test_get_input_file_without_file_existing(self): - input_retriever_factory = InputRetrieverFactory( - InputsConfig(input_filename=Path("prompt.txt")) - ) - with pytest.raises(FileNotFoundError): - input_retriever_factory._get_input_dataset_from_file() - - @patch("pathlib.Path.exists", return_value=True) - @patch( - "builtins.open", - new_callable=mock_open, - read_data='{"text_input": "single prompt"}\n', - ) - def test_get_input_file_with_single_prompt(self, mock_file, mock_exists): - expected_prompts = ["single prompt"] - input_retriever_factory = InputRetrieverFactory( - InputsConfig( - model_name=["test_model_A"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - input_filename=Path("prompt.txt"), - ) - ) - dataset = input_retriever_factory._get_input_dataset_from_file() - - assert dataset is not None - assert len(dataset["rows"]) == len(expected_prompts) - for i, prompt in enumerate(expected_prompts): - assert dataset["rows"][i]["row"]["text_input"] == prompt - - @patch("pathlib.Path.exists", return_value=True) - @patch( - "builtins.open", - new_callable=mock_open, - read_data='{"text_input": "prompt1"}\n{"text_input": "prompt2"}\n{"text_input": "prompt3"}\n', - ) - def test_get_input_file_with_multiple_prompts(self, mock_file, mock_exists): - expected_prompts = ["prompt1", "prompt2", "prompt3"] - input_retriever_factory = InputRetrieverFactory( - InputsConfig( - model_name=["test_model_A"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - input_filename=Path("prompt.txt"), - ) - ) - dataset = input_retriever_factory._get_input_dataset_from_file() - - assert dataset is not None - assert len(dataset["rows"]) == len(expected_prompts) - for i, prompt in enumerate(expected_prompts): - assert dataset["rows"][i]["row"]["text_input"] == prompt - - @patch("pathlib.Path.exists", return_value=True) - @patch("PIL.Image.open", return_value=Image.new("RGB", (10, 10))) - @patch( - "builtins.open", - new_callable=mock_open, - read_data=( - '{"text_input": "prompt1", "image": "image1.png"}\n' - '{"text_input": "prompt2", "image": "image2.png"}\n' - '{"text_input": "prompt3", "image": "image3.png"}\n' - ), - ) - def test_get_input_file_with_multi_modal_data( - self, mock_exists, mock_image, mock_file - ): - input_retriever_factory = InputRetrieverFactory( - InputsConfig( - model_name=["test_model_A"], - model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, - input_filename=Path("prompt.txt"), - ) - ) - Data = namedtuple("Data", ["text_input", "image"]) - expected_data = [ - Data(text_input="prompt1", image="image1.png"), - Data(text_input="prompt2", image="image2.png"), - Data(text_input="prompt3", image="image3.png"), - ] - dataset = input_retriever_factory._get_input_dataset_from_file() - - assert dataset is not None - assert len(dataset["rows"]) == len(expected_data) - for i, data in enumerate(expected_data): - assert dataset["rows"][i]["row"]["text_input"] == data.text_input - assert dataset["rows"][i]["row"]["image"] == data.image