From f3c4f530c3777b916539afd6565c83440f53959d Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:35:16 -0700 Subject: [PATCH 1/9] Guard GenAI-Perf plot generation (#732) --- .../genai-perf/genai_perf/main.py | 4 +- .../genai-perf/genai_perf/parser.py | 5 ++ .../genai-perf/tests/test_artifacts.py | 12 ++++- .../genai-perf/tests/test_cli.py | 52 +++++++++++++++++++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py index 7692d02e5..912ee4725 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py @@ -43,10 +43,10 @@ def create_artifacts_dirs(args: Namespace) -> None: - # TMA-1911: support plots CLI option plot_dir = args.artifact_dir / "plots" os.makedirs(args.artifact_dir, exist_ok=True) - os.makedirs(plot_dir, exist_ok=True) + if hasattr(args, "generate_plots") and args.generate_plots: + os.makedirs(plot_dir, exist_ok=True) def generate_inputs(args: Namespace, tokenizer: Tokenizer) -> None: diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 6c8dfe4cb..64178fd4c 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -178,6 +178,11 @@ def _check_conditional_args_embeddings_rankings( parser.error( f"The --streaming option is not supported with the {args.endpoint_type} endpoint type." ) + + if args.generate_plots: + parser.error( + f"The --generate-plots option is not currently supported with the {args.endpoint_type} endpoint type." + ) else: if args.batch_size != LlmInputs.DEFAULT_BATCH_SIZE: parser.error( diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py b/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py index 56b1b38de..cdcc4afc9 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_artifacts.py @@ -38,7 +38,7 @@ def mock_makedirs(mocker): def test_create_artifacts_dirs_custom_path(mock_makedirs): artifacts_dir_path = "/genai_perf_artifacts" - mock_args = Namespace(artifact_dir=Path(artifacts_dir_path)) + mock_args = Namespace(artifact_dir=Path(artifacts_dir_path), generate_plots=True) create_artifacts_dirs(mock_args) mock_makedirs.assert_any_call( Path(artifacts_dir_path), exist_ok=True @@ -47,3 +47,13 @@ def test_create_artifacts_dirs_custom_path(mock_makedirs): Path(artifacts_dir_path) / "plots", exist_ok=True ), f"Expected os.makedirs to create plots directory inside {artifacts_dir_path}/plots path." assert mock_makedirs.call_count == 2 + + +def test_create_artifacts_disable_generate_plots(mock_makedirs): + artifacts_dir_path = "/genai_perf_artifacts" + mock_args = Namespace(artifact_dir=Path(artifacts_dir_path)) + create_artifacts_dirs(mock_args) + mock_makedirs.assert_any_call( + Path(artifacts_dir_path), exist_ok=True + ), f"Expected os.makedirs to create artifacts directory inside {artifacts_dir_path} path." 
+ assert mock_makedirs.call_count == 1 diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py index bf8fd023e..cc005beef 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py @@ -515,6 +515,58 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ], "The --batch-size option is currently only supported with the embeddings and rankings endpoint types", ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "embeddings", + "--streaming", + ], + "The --streaming option is not supported with the embeddings endpoint type", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "rankings", + "--streaming", + ], + "The --streaming option is not supported with the rankings endpoint type", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "embeddings", + "--generate-plots", + ], + "The --generate-plots option is not currently supported with the embeddings endpoint type", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "rankings", + "--generate-plots", + ], + "The --generate-plots option is not currently supported with the rankings endpoint type", + ), ], ) def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): From d3fadc1f21860e9218061d387686937eb4af2bec Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Wed, 3 Jul 2024 09:26:43 -0700 Subject: [PATCH 2/9] Add support for Hugging Face Text Embeddings Interface's re-ranker API (#728) --- src/c++/perf_analyzer/genai-perf/README.md | 5 +- .../genai-perf/docs/embeddings.md | 10 +- src/c++/perf_analyzer/genai-perf/docs/lora.md | 6 +- .../perf_analyzer/genai-perf/docs/rankings.md | 100 ++++++++++++++++++ .../genai_perf/llm_inputs/llm_inputs.py | 44 ++++++-- .../llm_profile_data_parser.py | 2 + .../profile_data_parser.py | 5 +- .../tests/test_profile_data_parser.py | 77 +++++++++++++- 8 files changed, 226 insertions(+), 23 deletions(-) create mode 100644 src/c++/perf_analyzer/genai-perf/docs/rankings.md diff --git a/src/c++/perf_analyzer/genai-perf/README.md b/src/c++/perf_analyzer/genai-perf/README.md index d9f288996..9c553115d 100644 --- a/src/c++/perf_analyzer/genai-perf/README.md +++ b/src/c++/perf_analyzer/genai-perf/README.md @@ -373,7 +373,7 @@ model config to not echo the input tokens in the output. (default: tensorrtllm) Set a custom endpoint that differs from the OpenAI defaults. (default: `None`) -##### `--endpoint-type {chat,completions,embeddings}` +##### `--endpoint-type {chat,completions,embeddings,rankings}` The endpoint-type to send requests to on the server. This is only used with the `openai` service-kind. (default: `None`) @@ -400,7 +400,8 @@ URL of the endpoint to target for benchmarking. (default: `None`) The batch size of the requests GenAI-Perf should send. This is currently only supported with the [embeddings endpoint type](docs/embeddings.md). -(default: `1`) +(default: `1`) and +[rankings endpoint type](docs/rankings.md). 
##### `--extra-inputs ` diff --git a/src/c++/perf_analyzer/genai-perf/docs/embeddings.md b/src/c++/perf_analyzer/genai-perf/docs/embeddings.md index e61c397d9..bc6e2d413 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/embeddings.md +++ b/src/c++/perf_analyzer/genai-perf/docs/embeddings.md @@ -26,12 +26,12 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# Profiling Embeddings Models with GenAI-Perf +# Profile Embeddings Models with GenAI-Perf GenAI-Perf allows you to profile embedding models running on an [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)-compatible server. -## Creating a Sample Embeddings Input File +## Create a Sample Embeddings Input File To create a sample embeddings input file, use the following command: @@ -50,13 +50,13 @@ This will generate a file named embeddings.jsonl with the following content: {"text": "In what state did they film Shrek 2?"} ``` -## Starting an OpenAI Embeddings-Compatible Server +## Start an OpenAI Embeddings-Compatible Server To start an OpenAI embeddings-compatible server, run the following command: ```bash docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model intfloat/e5-mistral-7b-instruct --dtype float16 --max-model-len 1024 ``` -## Running GenAI-Perf +## Run GenAI-Perf To profile embeddings models using GenAI-Perf, use the following command: ```bash @@ -90,4 +90,4 @@ Example output: │ Request latency (ms) │ 42.21 │ 28.18 │ 318.61 │ 56.50 │ 49.21 │ 43.07 │ └──────────────────────┴───────┴───────┴────────┴───────┴───────┴───────┘ Request throughput (per sec): 23.63 -``` \ No newline at end of file +``` diff --git a/src/c++/perf_analyzer/genai-perf/docs/lora.md b/src/c++/perf_analyzer/genai-perf/docs/lora.md index 60be30c95..b3ddbe479 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/lora.md +++ b/src/c++/perf_analyzer/genai-perf/docs/lora.md @@ -26,17 +26,17 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# Profiling Multiple LoRA Adapters +# Profile Multiple LoRA Adapters GenAI-Perf allows you to profile multiple LoRA adapters on top of a base model. -## Selecting LoRA Adapters +## Select LoRA Adapters To do this, list multiple adapters after the model name option `-m`: ```bash genai-perf -m lora_adapter1 lora_adapter2 lora_adapter3 ``` -## Choosing a Strategy for Selecting Models +## Choose a Strategy for Selecting Models When profiling with multiple models, you can specify how the models should be assigned to prompts using the `--model-selection-strategy` option: diff --git a/src/c++/perf_analyzer/genai-perf/docs/rankings.md b/src/c++/perf_analyzer/genai-perf/docs/rankings.md new file mode 100644 index 000000000..d195b25db --- /dev/null +++ b/src/c++/perf_analyzer/genai-perf/docs/rankings.md @@ -0,0 +1,100 @@ + + +# Profile Ranking Models with GenAI-Perf + + +GenAI-Perf allows you to profile ranking models compatible with Hugging Face's +[Text Embeddings Interface's re-ranker API](https://huggingface.co/docs/text-embeddings-inference/en/quick_tour#re-rankers). 
+ +## Create a Sample Rankings Input Directory + +To create a sample rankings input directory, follow these steps: + +Create a directory called rankings_jsonl: +```bash +mkdir rankings_jsonl +``` + +Inside this directory, create a JSONL file named queries.jsonl with queries data: + +```bash +echo '{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"}' > rankings_jsonl/queries.jsonl +``` + +Create another JSONL file named passages.jsonl with passages data: + +```bash +echo '{"text": "Eric Anderson (born January 18, 1968) is an American sociologist and sexologist."} +{"text": "Kevin Loader is a British film and television producer."} +{"text": "Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia."} +{"text": "Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."}' > rankings_jsonl/passages.jsonl +``` + +## Start a Hugging Face Re-Ranker-Compatible Server +To start a Hugging Face re-ranker-compatible server, run the following commands: + +```bash +model=BAAI/bge-reranker-large +revision=refs/pr/4 +volume=$PWD/data + +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.3 --model-id $model --revision $revision +``` + +## Run GenAI-Perf +To profile ranking models using GenAI-Perf, use the following command: + +```bash +genai-perf \ + -m BAAI/bge-reranker-large \ + --service-kind openai \ + --endpoint-type rankings \ + --endpoint rerank \ + --input-file rankings_jsonl/ \ + -u localhost:8080 \ + --extra-inputs rankings:tei \ + --batch-size 2 +``` + +This command specifies the use of Hugging Face's ranking API with `--endpoint rerank` and `--extra-inputs rankings:tei`. + +Example output: + +``` + Rankings Metrics +┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━┩ +│ Request latency (ms) │ 5.48 │ 2.50 │ 23.91 │ 10.27 │ 8.34 │ 6.07 │ +└──────────────────────┴──────┴──────┴───────┴───────┴──────┴──────┘ +Request throughput (per sec): 180.11 +``` diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index d7384f6b8..de528aac4 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -139,7 +139,7 @@ def create_llm_inputs( output_tokens_deterministic: If true, the output tokens will set the minimum and maximum tokens to be equivalent. 
batch_size: - The number of inputs per request (currently only used for v1/embeddings) + The number of inputs per request (currently only used for the embeddings and rankings endpoints) Required Synthetic Prompt Generation Parameters ----------------------------------------------- @@ -236,7 +236,7 @@ def get_generic_dataset_json( num_of_output_prompts: The number of synthetic output prompts to generate batch_size: - The number of inputs per request (currently only used for v1/embeddings) + The number of inputs per request (currently only used for the embeddings and rankings endpoints) input_filename: The path to the input file containing the prompts in JSONL format. Returns @@ -733,6 +733,16 @@ def _convert_generic_json_to_openai_embeddings_format( return pa_json + @classmethod + def contains_rankings_tei(cls, extra_inputs: Optional[Dict]) -> bool: + """ + Check if user specified that they are using the Hugging Face + Text Embeddings Interface for ranking models + """ + if extra_inputs and extra_inputs.get("rankings") == "tei": + return True + return False + @classmethod def _convert_generic_json_to_rankings_format( cls, @@ -742,6 +752,7 @@ def _convert_generic_json_to_rankings_format( model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, ) -> Dict[str, Any]: pa_json: Dict[str, Any] = {"data": []} + use_tei_format = cls.contains_rankings_tei(extra_inputs) for index, entry in enumerate(generic_dataset["rows"]): iter_model_name = cls._select_model_name( @@ -749,25 +760,36 @@ def _convert_generic_json_to_rankings_format( ) payload = entry.get("payload", {}) query_values = payload.get("query") - passage_values = payload.get("passages") + + if use_tei_format: + passage_values = payload.get("passages", []) + passage_values = [item.get("text", "") for item in passage_values] + else: + passage_values = payload.get("passages") if query_values is None: raise ValueError("Missing required fields 'query' in dataset entry") if passage_values is None: - raise ValueError("Missing required fields 'passages' in dataset entry") + raise ValueError( + f"Missing required fields '{'texts' if use_tei_format else 'passages'}' in dataset entry" + ) if not isinstance(passage_values, list): raise ValueError( - f"Required field 'query' must be a list (actual: {type(query_values)})" + f"Required field '{'texts' if use_tei_format else 'passages'}' must be a list (actual: {type(passage_values)})" ) - payload = { - "query": query_values, - "passages": passage_values, - "model": iter_model_name, - } + if use_tei_format: + payload = {"query": query_values["text"], "texts": passage_values} + else: + payload = { + "query": query_values, + "passages": passage_values, + "model": iter_model_name, + } for key, value in extra_inputs.items(): - payload[key] = value + if not (key == "rankings" and value == "tei"): + payload[key] = value pa_json["data"].append({"payload": [payload]}) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py index cbb2da5ee..d42c4fb63 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py @@ -218,6 +218,8 @@ def _get_openai_input_text(self, req_inputs: dict) -> str: return payload["messages"][0]["content"] elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: return 
payload["prompt"] + elif self._response_format == ResponseFormat.HUGGINGFACE_RANKINGS: + return payload["query"] else: raise ValueError( "Failed to parse OpenAI request input in profile export file." diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py index 7fa069fbb..d18d8f6fb 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py @@ -35,6 +35,7 @@ class ResponseFormat(Enum): + HUGGINGFACE_RANKINGS = auto() OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() OPENAI_EMBEDDINGS = auto() @@ -55,7 +56,9 @@ def __init__(self, filename: Path) -> None: def _get_profile_metadata(self, data: dict) -> None: self._service_kind = data["service_kind"] if self._service_kind == "openai": - if data["endpoint"] == "v1/chat/completions": + if data["endpoint"] == "rerank": + self._response_format = ResponseFormat.HUGGINGFACE_RANKINGS + elif data["endpoint"] == "v1/chat/completions": self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS elif data["endpoint"] == "v1/completions": self._response_format = ResponseFormat.OPENAI_COMPLETIONS diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py index e63643e39..fe303c514 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_profile_data_parser.py @@ -67,9 +67,12 @@ def write(self: Any, content: str) -> int: if filename == "embedding_profile_export.json": tmp_file = StringIO(json.dumps(self.embedding_profile_data)) return tmp_file - if filename == "ranking_profile_export.json": + elif filename == "ranking_profile_export.json": tmp_file = StringIO(json.dumps(self.ranking_profile_data)) return tmp_file + elif filename == "huggingface_ranking_profile_export.json": + tmp_file = StringIO(json.dumps(self.huggingface_ranking_profile_data)) + return tmp_file elif filename == "profile_export.csv": tmp_file = StringIO() tmp_file.write = write.__get__(tmp_file) @@ -220,3 +223,75 @@ def test_ranking_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore + + # ================================================ + # HUGGINGFACE RANKINGS API + # ================================================ + huggingface_ranking_profile_data = { + "service_kind": "openai", + "endpoint": "rerank", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"query":"What was the first car ever driven?","texts":["Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. 
Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget.","Kevin Loader is a British film and television producer."]}' + }, + "response_timestamps": [3], + "response_outputs": [ + { + "response": '[{"index":0,"score":0.0032476764},{"index":1,"score":0.00036117696}]' + }, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"query":"In what state did they film Shrek 2?","texts":["Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia.","Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."]}' + }, + "response_timestamps": [5], + "response_outputs": [ + { + "response": '[{"index":0,"score":0.020177318},{"index":1,"score":0.01461567}]' + }, + ], + }, + ], + }, + ], + } + + def test_huggingface_ranking_profile_data( + self, mock_read_write: pytest.MonkeyPatch + ) -> None: + """Collect base metrics from HuggingFace ranking profile export data and check values. + + Metrics + * request latencies + - [3 - 1, 5 - 2] = [2, 3] + * request throughputs + - [2 / (5e-9 - 1e-9)] = [5e8] + """ + pd = ProfileDataParser(filename=Path("huggingface_ranking_profile_export.json")) + + # experiment 1 statistics + stats = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stats.metrics + stats_dict = stats.stats_dict + assert isinstance(metrics, Metrics) + + assert metrics.request_latencies == [2, 3] + assert metrics.request_throughputs == [pytest.approx(5e8)] + + assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore + assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore + assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore + + assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore From 44ed1215960e72cd241aef07ea6a33ce3d727265 Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Wed, 3 Jul 2024 10:52:51 -0700 Subject: [PATCH 3/9] Remove unnecessary input text branch (#734) --- .../genai_perf/profile_data_parser/llm_profile_data_parser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py index d42c4fb63..cbb2da5ee 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py @@ -218,8 +218,6 @@ def _get_openai_input_text(self, req_inputs: dict) -> str: return payload["messages"][0]["content"] elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: return payload["prompt"] - elif self._response_format == ResponseFormat.HUGGINGFACE_RANKINGS: - return payload["query"] else: raise ValueError( "Failed to parse OpenAI request input in profile export file." 
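Note: the rankings support added in PATCH 2/9 (and trimmed in PATCH 3/9) produces two different request payload shapes, selected by passing `--extra-inputs rankings:tei`. Below is a minimal sketch of that branching, assuming the field names shown in the llm_inputs.py diff above (`query`, `passages`, `texts`, `model`); the helper function itself is hypothetical and is not part of GenAI-Perf.

```python
# Simplified sketch (not the library code) of how the rankings payload
# conversion behaves with and without `--extra-inputs rankings:tei`.
from typing import Any, Dict, List, Optional


def build_rankings_payload(
    query: Dict[str, str],
    passages: List[Dict[str, str]],
    model_name: str,
    extra_inputs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    extra_inputs = extra_inputs or {}
    use_tei_format = extra_inputs.get("rankings") == "tei"

    if use_tei_format:
        # Hugging Face TEI re-rankers expect a flat "query" string and a
        # "texts" list of strings.
        payload: Dict[str, Any] = {
            "query": query["text"],
            "texts": [p.get("text", "") for p in passages],
        }
    else:
        # The default rankings format keeps the structured entries and the
        # model name in the payload.
        payload = {"query": query, "passages": passages, "model": model_name}

    # Pass through any other extra inputs, but never forward the
    # "rankings: tei" selector itself.
    for key, value in extra_inputs.items():
        if not (key == "rankings" and value == "tei"):
            payload[key] = value
    return payload


if __name__ == "__main__":
    print(
        build_rankings_payload(
            {"text": "What was the first car ever driven?"},
            [{"text": "Kevin Loader is a British film and television producer."}],
            "BAAI/bge-reranker-large",
            {"rankings": "tei"},
        )
    )
```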
From 989b3f7983115c87302cc3202bc6b9cb627f3aad Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Mon, 8 Jul 2024 15:36:04 -0700 Subject: [PATCH 4/9] Update version 0.0.4 (#739) * Update version 0.0.4 --- src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py b/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py index cb5c26999..5e15090be 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py @@ -24,4 +24,4 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -__version__ = "0.0.4dev" +__version__ = "0.0.4" From 21562b939407229889ea5cd598a9a7fd0b11dab1 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Mon, 8 Jul 2024 15:40:27 -0700 Subject: [PATCH 5/9] Revert "Update version 0.0.4 (#739)" (#740) This reverts commit 989b3f7983115c87302cc3202bc6b9cb627f3aad. --- src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py b/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py index 5e15090be..cb5c26999 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/__init__.py @@ -24,4 +24,4 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -__version__ = "0.0.4" +__version__ = "0.0.4dev" From e83862adf0054bb7db367589d6f4a42445d497c1 Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Tue, 9 Jul 2024 15:14:47 -0700 Subject: [PATCH 6/9] Update the name of Hugging Face TEI (#744) --- src/c++/perf_analyzer/genai-perf/docs/rankings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/genai-perf/docs/rankings.md b/src/c++/perf_analyzer/genai-perf/docs/rankings.md index d195b25db..5cd1a4812 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/rankings.md +++ b/src/c++/perf_analyzer/genai-perf/docs/rankings.md @@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GenAI-Perf allows you to profile ranking models compatible with Hugging Face's -[Text Embeddings Interface's re-ranker API](https://huggingface.co/docs/text-embeddings-inference/en/quick_tour#re-rankers). +[Text Embeddings Inference's re-ranker API](https://huggingface.co/docs/text-embeddings-inference/en/quick_tour#re-rankers). ## Create a Sample Rankings Input Directory From cb5710c484ccb5cefbfe9a624e8e45eb4eba4eb6 Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:14:19 -0700 Subject: [PATCH 7/9] Clarify new arguments in documentation (#746) --- src/c++/perf_analyzer/docs/cli.md | 7 +++++++ src/c++/perf_analyzer/genai-perf/README.md | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/c++/perf_analyzer/docs/cli.md b/src/c++/perf_analyzer/docs/cli.md index 399596fd6..bd82415c8 100644 --- a/src/c++/perf_analyzer/docs/cli.md +++ b/src/c++/perf_analyzer/docs/cli.md @@ -157,6 +157,13 @@ will also be reported in the results. Default is `-1` indicating that the average latency is used to determine stability. 
+#### `--request-count=` + +Specifies a total number of requests to use for measurement. + +Default is `0`, which means that there is no request count and the measurement +will proceed using windows until stabilization is detected. + #### `-r ` #### `--max-trials=` diff --git a/src/c++/perf_analyzer/genai-perf/README.md b/src/c++/perf_analyzer/genai-perf/README.md index 9c553115d..45159cc15 100644 --- a/src/c++/perf_analyzer/genai-perf/README.md +++ b/src/c++/perf_analyzer/genai-perf/README.md @@ -301,8 +301,8 @@ options: When the dataset is coming from a file, you can specify the following options: -* `--input-file `: The input file containing the single prompt to - use for benchmarking. +* `--input-file `: The input file containing the prompts to + use for benchmarking as JSON objects. For any dataset, you can specify the following options: * `--output-tokens-mean `: The mean number of tokens in each output. Ensure From ade066d486498c5e5dd0cb701cca81bcaddb24c6 Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:12:10 -0700 Subject: [PATCH 8/9] Move GenAI-Perf profiling to its own subcommand (#745) --- src/c++/perf_analyzer/genai-perf/README.md | 4 +- .../genai-perf/docs/embeddings.md | 4 +- src/c++/perf_analyzer/genai-perf/docs/lora.md | 2 +- .../perf_analyzer/genai-perf/docs/rankings.md | 2 +- .../perf_analyzer/genai-perf/docs/tutorial.md | 8 +- .../genai-perf/genai_perf/parser.py | 114 ++++++++++++------ .../genai-perf/genai_perf/test_end_to_end.py | 12 +- .../genai-perf/tests/test_cli.py | 74 +++++++----- .../genai-perf/tests/test_console_exporter.py | 3 + .../genai-perf/tests/test_csv_exporter.py | 3 + .../genai-perf/tests/test_json_exporter.py | 3 +- .../genai-perf/tests/test_wrapper.py | 28 ++++- 12 files changed, 169 insertions(+), 88 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/README.md b/src/c++/perf_analyzer/genai-perf/README.md index 45159cc15..1d03b3dd0 100644 --- a/src/c++/perf_analyzer/genai-perf/README.md +++ b/src/c++/perf_analyzer/genai-perf/README.md @@ -162,7 +162,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -209,7 +209,7 @@ current profile run. 
This is disabled by default but users can easily enable it by passing the `--generate-plots` option when running the benchmark: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ diff --git a/src/c++/perf_analyzer/genai-perf/docs/embeddings.md b/src/c++/perf_analyzer/genai-perf/docs/embeddings.md index bc6e2d413..e508f9eff 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/embeddings.md +++ b/src/c++/perf_analyzer/genai-perf/docs/embeddings.md @@ -60,7 +60,7 @@ docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model intflo To profile embeddings models using GenAI-Perf, use the following command: ```bash -genai-perf \ +genai-perf profile \ -m intfloat/e5-mistral-7b-instruct \ --service-kind openai \ --endpoint-type embeddings \ @@ -73,7 +73,7 @@ additional arguments with the `--extra-inputs` [flag](../README.md#input-options For example, you could use this command: ```bash -genai-perf \ +genai-perf profile \ -m intfloat/e5-mistral-7b-instruct \ --service-kind openai \ --endpoint-type embeddings \ diff --git a/src/c++/perf_analyzer/genai-perf/docs/lora.md b/src/c++/perf_analyzer/genai-perf/docs/lora.md index b3ddbe479..d30867eda 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/lora.md +++ b/src/c++/perf_analyzer/genai-perf/docs/lora.md @@ -41,7 +41,7 @@ When profiling with multiple models, you can specify how the models should be assigned to prompts using the `--model-selection-strategy` option: ```bash -genai-perf \ +genai-perf profile \ -m lora_adapter1 lora_adapter2 lora_adapter3 \ --model-selection-strategy round_robin ``` diff --git a/src/c++/perf_analyzer/genai-perf/docs/rankings.md b/src/c++/perf_analyzer/genai-perf/docs/rankings.md index 5cd1a4812..a316ef857 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/rankings.md +++ b/src/c++/perf_analyzer/genai-perf/docs/rankings.md @@ -74,7 +74,7 @@ docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingf To profile ranking models using GenAI-Perf, use the following command: ```bash -genai-perf \ +genai-perf profile \ -m BAAI/bge-reranker-large \ --service-kind openai \ --endpoint-type rankings \ diff --git a/src/c++/perf_analyzer/genai-perf/docs/tutorial.md b/src/c++/perf_analyzer/genai-perf/docs/tutorial.md index bc9dec71b..6d6f3e301 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/tutorial.md +++ b/src/c++/perf_analyzer/genai-perf/docs/tutorial.md @@ -82,7 +82,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -166,7 +166,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend vllm \ @@ -232,7 +232,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind openai \ --endpoint v1/chat/completions \ @@ -296,7 +296,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. 
Run GenAI-Perf: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind openai \ --endpoint v1/completions \ diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 64178fd4c..521b30e53 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -61,6 +61,14 @@ def to_lowercase(self): return self.name.lower() +class Subcommand(Enum): + PROFILE = auto() + COMPARE = auto() + + def to_lowercase(self): + return self.name.lower() + + logger = logging.getLogger(__name__) _endpoint_type_map = { @@ -77,7 +85,7 @@ def _check_model_args( """ Check if model name is provided. """ - if not args.subcommand and not args.model: + if not args.model: parser.error("The -m/--model option is required and cannot be empty.") args = _convert_str_to_enum_entry( args, "model_selection_strategy", ModelSelectionStrategy @@ -102,9 +110,8 @@ def _check_compare_args( """ Check compare subcommand args """ - if args.subcommand == "compare": - if not args.config and not args.files: - parser.error("Either the --config or --files option must be specified.") + if not args.config and not args.files: + parser.error("Either the --config or --files option must be specified.") return args @@ -573,13 +580,6 @@ def _add_other_args(parser): help="An option to enable verbose mode.", ) - other_group.add_argument( - "--version", - action="version", - version="%(prog)s " + __version__, - help=f"An option to print the version and exit.", - ) - def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: request_inputs = {} @@ -626,10 +626,10 @@ def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: def _parse_compare_args(subparsers) -> argparse.ArgumentParser: compare = subparsers.add_parser( - "compare", + Subcommand.COMPARE.to_lowercase(), description="Subcommand to generate plots that compare multiple profile runs.", ) - compare_group = compare.add_argument_group("Compare") + compare_group = compare.add_argument_group("Input") mx_group = compare_group.add_mutually_exclusive_group(required=False) mx_group.add_argument( "--config", @@ -651,6 +651,20 @@ def _parse_compare_args(subparsers) -> argparse.ArgumentParser: return compare +def _parse_profile_args(subparsers) -> argparse.ArgumentParser: + profile = subparsers.add_parser( + Subcommand.PROFILE.to_lowercase(), + description="Subcommand to profile LLMs and Generative AI models.", + ) + _add_endpoint_args(profile) + _add_input_args(profile) + _add_profile_args(profile) + _add_output_args(profile) + _add_other_args(profile) + profile.set_defaults(func=profile_handler) + return profile + + ### Handlers ### @@ -659,12 +673,6 @@ def create_compare_dir() -> None: os.mkdir(DEFAULT_COMPARE_DIR) -def profile_handler(args, extra_args): - from genai_perf.wrapper import Profiler - - Profiler.run(args=args, extra_args=extra_args) - - def compare_handler(args: argparse.Namespace): """Handles `compare` subcommand workflow.""" if args.files: @@ -679,45 +687,75 @@ def compare_handler(args: argparse.Namespace): plot_manager.generate_plots() -### Entrypoint ### +def profile_handler(args, extra_args): + from genai_perf.wrapper import Profiler + Profiler.run(args=args, extra_args=extra_args) -def parse_args(): - argv = sys.argv +### Parser Initialization ### + + +def init_parsers(): parser = argparse.ArgumentParser( prog="genai-perf", description="CLI to profile LLMs and Generative AI models with Perf Analyzer", 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.set_defaults(func=profile_handler) - - # Conceptually group args for easier visualization - _add_endpoint_args(parser) - _add_input_args(parser) - _add_profile_args(parser) - _add_output_args(parser) - _add_other_args(parser) + parser.add_argument( + "--version", + action="version", + version="%(prog)s " + __version__, + help=f"An option to print the version and exit.", + ) # Add subcommands subparsers = parser.add_subparsers( help="List of subparser commands.", dest="subcommand" ) - compare_parser = _parse_compare_args(subparsers) + _ = _parse_compare_args(subparsers) + _ = _parse_profile_args(subparsers) + subparsers.required = True + + return parser - # Check for passthrough args + +def get_passthrough_args_index(argv: list) -> int: if "--" in argv: passthrough_index = argv.index("--") logger.info(f"Detected passthrough args: {argv[passthrough_index + 1:]}") else: passthrough_index = len(argv) + return passthrough_index + + +def refine_args( + parser: argparse.ArgumentParser, args: argparse.Namespace +) -> argparse.Namespace: + if args.subcommand == Subcommand.PROFILE.to_lowercase(): + args = _infer_prompt_source(args) + args = _check_model_args(parser, args) + args = _check_conditional_args(parser, args) + args = _check_load_manager_args(args) + args = _set_artifact_paths(args) + elif args.subcommand == Subcommand.COMPARE.to_lowercase(): + args = _check_compare_args(parser, args) + else: + raise ValueError(f"Unknown subcommand: {args.subcommand}") + + return args + + +### Entrypoint ### + + +def parse_args(): + argv = sys.argv + + parser = init_parsers() + passthrough_index = get_passthrough_args_index(argv) args = parser.parse_args(argv[1:passthrough_index]) - args = _infer_prompt_source(args) - args = _check_model_args(parser, args) - args = _check_conditional_args(parser, args) - args = _check_compare_args(compare_parser, args) - args = _check_load_manager_args(args) - args = _set_artifact_paths(args) + args = refine_args(parser, args) return args, argv[passthrough_index + 1 :] diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py b/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py index 3cc2999f5..a44304348 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/test_end_to_end.py @@ -10,7 +10,7 @@ # For all cases but vllm_openai, it assumes that the server will be on port 9999 # # This script will run a sweep of all combinations of values in the testing matrix -# by appending those options on to the genai-pa base command +# by appending those options on to the genai-perf base command # @@ -20,11 +20,11 @@ ] base_commands = { - "nim_chat": "genai-perf -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", - "nim_completions": "genai-perf -s 999 -p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", - "vllm_openai": "genai-perf -s 999 -p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", - "triton_tensorrtllm": "genai-perf -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", - "triton_vllm": "genai-perf -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", + "nim_chat": "genai-perf profile -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", + "nim_completions": "genai-perf profile -s 999 
-p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", + "vllm_openai": "genai-perf profile -s 999 -p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", + "triton_tensorrtllm": "genai-perf profile -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", + "triton_vllm": "genai-perf profile -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", } testname = "" diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py index cc005beef..eb891fd02 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py @@ -52,10 +52,7 @@ class TestCLIArguments: [ (["-h"], expected_help_output), (["--help"], expected_help_output), - (["-m", "abc", "--help"], expected_help_output), - (["-m", "abc", "-h"], expected_help_output), (["--version"], expected_version_output), - (["-m", "abc", "--version"], expected_version_output), ], ) def test_help_version_arguments_output_and_exit( @@ -226,7 +223,7 @@ def test_help_version_arguments_output_and_exit( ) def test_non_file_flags_parsed(self, monkeypatch, arg, expected_attributes, capsys): logging.init_logging() - combined_args = ["genai-perf", "--model", "test_model"] + arg + combined_args = ["genai-perf", "profile", "--model", "test_model"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -267,7 +264,7 @@ def test_multiple_model_args( self, monkeypatch, models, expected_model_list, formatted_name, capsys ): logging.init_logging() - combined_args = ["genai-perf"] + models + combined_args = ["genai-perf", "profile"] + models monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -287,6 +284,7 @@ def test_file_flags_parsed(self, monkeypatch, mocker): _ = mocker.patch("os.path.isfile", return_value=True) combined_args = [ "genai-perf", + "profile", "--model", "test_model", "--input-file", @@ -340,7 +338,7 @@ def test_default_profile_export_filepath( self, monkeypatch, arg, expected_path, capsys ): logging.init_logging() - combined_args = ["genai-perf", "--model", "test_model"] + arg + combined_args = ["genai-perf", "profile", "--model", "test_model"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -380,7 +378,7 @@ def test_model_name_artifact_path( self, monkeypatch, arg, expected_path, expected_output, capsys ): logging.init_logging() - combined_args = ["genai-perf"] + arg + combined_args = ["genai-perf", "profile"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -390,7 +388,9 @@ def test_model_name_artifact_path( def test_default_load_level(self, monkeypatch, capsys): logging.init_logging() - monkeypatch.setattr("sys.argv", ["genai-perf", "--model", "test_model"]) + monkeypatch.setattr( + "sys.argv", ["genai-perf", "profile", "--model", "test_model"] + ) args, _ = parser.parse_args() assert args.concurrency == 1 captured = capsys.readouterr() @@ -398,7 +398,8 @@ def test_default_load_level(self, monkeypatch, capsys): def test_load_level_mutually_exclusive(self, monkeypatch, capsys): monkeypatch.setattr( - "sys.argv", ["genai-perf", "--concurrency", "3", "--request-rate", "9.0"] + "sys.argv", + ["genai-perf", "profile", "--concurrency", "3", "--request-rate", "9.0"], ) expected_output = ( "argument --request-rate: not allowed with argument --concurrency" @@ -412,7 +413,7 @@ def 
test_load_level_mutually_exclusive(self, monkeypatch, capsys): assert expected_output in captured.err def test_model_not_provided(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["genai-perf"]) + monkeypatch.setattr("sys.argv", ["genai-perf", "profile"]) expected_output = "The -m/--model option is required and cannot be empty." with pytest.raises(SystemExit) as excinfo: @@ -423,7 +424,7 @@ def test_model_not_provided(self, monkeypatch, capsys): assert expected_output in captured.err def test_pass_through_args(self, monkeypatch): - args = ["genai-perf", "-m", "test_model"] + args = ["genai-perf", "profile", "-m", "test_model"] other_args = ["--", "With", "great", "power"] monkeypatch.setattr("sys.argv", args + other_args) _, pass_through_args = parser.parse_args() @@ -435,6 +436,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "sys.argv", [ "genai-perf", + "profile", "-m", "nonexistent_model", "--wrong-arg", @@ -453,12 +455,20 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "args, expected_output", [ ( - ["genai-perf", "-m", "test_model", "--service-kind", "openai"], + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "openai", + ], "The --endpoint-type option is required when using the 'openai' service-kind.", ), ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -469,12 +479,20 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "The --endpoint-type option is required when using the 'openai' service-kind.", ), ( - ["genai-perf", "-m", "test_model", "--output-tokens-stddev", "5"], + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--output-tokens-stddev", + "5", + ], "The --output-tokens-mean option is required when using --output-tokens-stddev.", ), ( [ "genai-perf", + "profile", "-m", "test_model", "--output-tokens-mean-deterministic", @@ -484,6 +502,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--output-tokens-mean-deterministic", @@ -493,6 +512,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -508,6 +528,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--batch-size", @@ -518,6 +539,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -531,6 +553,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -544,6 +567,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -557,6 +581,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -613,7 +638,9 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): ], ) def test_inferred_output_format(self, monkeypatch, args, expected_format): - monkeypatch.setattr("sys.argv", ["genai-perf", "-m", "test_model"] + args) + monkeypatch.setattr( + "sys.argv", ["genai-perf", "profile", "-m", "test_model"] + args + ) parsed_args, _ = parser.parse_args() assert parsed_args.output_format == expected_format @@ -644,7 +671,7 @@ def test_inferred_output_format(self, monkeypatch, args, expected_format): ], ) def test_repeated_extra_arg_warning(self, monkeypatch, args, expected_error): - combined_args = ["genai-perf", "-m", "test_model"] + 
args + combined_args = ["genai-perf", "profile", "-m", "test_model"] + args monkeypatch.setattr("sys.argv", combined_args) parsed_args, _ = parser.parse_args() @@ -672,7 +699,7 @@ def test_inferred_prompt_source( _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) _ = mocker.patch("os.path.isfile", return_value=True) _ = mocker.patch("os.path.isdir", return_value=True) - combined_args = ["genai-perf", "--model", "test_model"] + args + combined_args = ["genai-perf", "profile", "--model", "test_model"] + args monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -684,6 +711,7 @@ def test_prompt_source_assertions(self, monkeypatch, mocker, capsys): _ = mocker.patch("os.path.isdir", return_value=True) args = [ "genai-perf", + "profile", "--model", "test_model", "--input-dataset", @@ -758,20 +786,6 @@ def test_compare_not_provided(self, monkeypatch, capsys): captured = capsys.readouterr() assert expected_output in captured.err - @pytest.mark.parametrize( - "args, expected_model", - [ - (["--files", "profile1.json", "profile2.json", "profile3.json"], None), - (["--config", "config.yaml"], None), - ], - ) - def test_compare_model_arg(self, monkeypatch, args, expected_model): - combined_args = ["genai-perf", "compare"] + args - monkeypatch.setattr("sys.argv", combined_args) - args, _ = parser.parse_args() - - assert args.model == expected_model - @pytest.mark.parametrize( "extra_inputs_list, expected_dict", [ diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py index ca11377ed..dda62e04a 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_console_exporter.py @@ -35,6 +35,7 @@ class TestConsoleExporter: def test_streaming_llm_output(self, monkeypatch, capsys) -> None: argv = [ "genai-perf", + "profile", "-m", "model_name", "--service-kind", @@ -86,6 +87,7 @@ def test_streaming_llm_output(self, monkeypatch, capsys) -> None: def test_nonstreaming_llm_output(self, monkeypatch, capsys) -> None: argv = [ "genai-perf", + "profile", "-m", "model_name", "--service-kind", @@ -135,6 +137,7 @@ def test_nonstreaming_llm_output(self, monkeypatch, capsys) -> None: def test_embedding_output(self, monkeypatch, capsys) -> None: argv = [ "genai-perf", + "profile", "-m", "model_name", "--service-kind", diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py index bd2d3bb81..6a60bc2dc 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_csv_exporter.py @@ -71,6 +71,7 @@ def test_streaming_llm_csv_output( """ argv = [ "genai-perf", + "profile", "-m", "model_name", "--service-kind", @@ -126,6 +127,7 @@ def test_nonstreaming_llm_csv_output( """ argv = [ "genai-perf", + "profile", "-m", "model_name", "--service-kind", @@ -174,6 +176,7 @@ def test_embedding_csv_output( ) -> None: argv = [ "genai-perf", + "profile", "-m", "model_name", "--service-kind", diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py index 998cc8865..e4a29267d 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py @@ -35,6 +35,7 @@ class TestJsonExporter: def test_generate_json(self, monkeypatch) -> None: cli_cmd = [ "genai-perf", + 
"profile", "-m", "gpt2_vllm", "--backend", @@ -257,7 +258,7 @@ def test_generate_json(self, monkeypatch) -> None: "artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1", "tokenizer": "hf-internal-testing/llama-tokenizer", "verbose": false, - "subcommand": null, + "subcommand": "profile", "prompt_source": "synthetic", "extra_inputs": { "max_tokens": 256, diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py b/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py index 184a47f11..fd4c34b51 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py @@ -43,7 +43,14 @@ class TestWrapper: ], ) def test_url_exactly_once_triton(self, monkeypatch, arg): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "triton", + ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -70,7 +77,14 @@ def test_url_exactly_once_triton(self, monkeypatch, arg): ], ) def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "triton", + ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -87,7 +101,14 @@ def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): ], ) def test_service_triton(self, monkeypatch, arg): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "triton", + ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -111,6 +132,7 @@ def test_service_triton(self, monkeypatch, arg): def test_service_openai(self, monkeypatch, arg): args = [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", From f1803604ebf6679346b1971e66bf0b5533ee8ce0 Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:00:16 -0700 Subject: [PATCH 9/9] When JSON parsing fails, return the failed string (#750) --- .../genai_perf/llm_inputs/llm_inputs.py | 11 ++++++----- .../genai-perf/genai_perf/parser.py | 2 +- .../llm_profile_data_parser.py | 8 ++++---- .../perf_analyzer/genai-perf/genai_perf/utils.py | 16 +++++++++++++++- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index de528aac4..39abc7ece 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -24,6 +24,7 @@ from genai_perf.exceptions import GenAIPerfException from genai_perf.llm_inputs.synthetic_prompt_generator import SyntheticPromptGenerator from genai_perf.tokenizer import DEFAULT_TOKENIZER, Tokenizer, get_tokenizer +from genai_perf.utils import load_json_str from requests import Response @@ -315,7 +316,7 @@ def _get_input_dataset_from_embeddings_file( cls, input_filename: Path, batch_size: int, num_prompts: int ) -> Dict[str, Any]: with open(input_filename, "r") as file: - file_content = [json.loads(line) 
for line in file] + file_content = [load_json_str(line) for line in file] texts = [item["text"] for item in file_content] @@ -344,11 +345,11 @@ def _get_input_dataset_from_rankings_files( ) -> Dict[str, Any]: with open(queries_filename, "r") as file: - queries_content = [json.loads(line) for line in file] + queries_content = [load_json_str(line) for line in file] queries_texts = [item for item in queries_content] with open(passages_filename, "r") as file: - passages_content = [json.loads(line) for line in file] + passages_content = [load_json_str(line) for line in file] passages_texts = [item for item in passages_content] if batch_size > len(passages_texts): @@ -363,7 +364,7 @@ def _get_input_dataset_from_rankings_files( for _ in range(num_prompts): sampled_texts = random.sample(passages_texts, batch_size) query_sample = random.choice(queries_texts) - entry_dict = {} + entry_dict: Dict = {} entry_dict["query"] = query_sample entry_dict["passages"] = sampled_texts dataset_json["rows"].append({"row": {"payload": entry_dict}}) @@ -536,7 +537,7 @@ def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]: with open(input_filename, mode="r", newline=None) as file: for line in file: if line.strip(): - prompts.append(json.loads(line).get("text_input", "").strip()) + prompts.append(load_json_str(line).get("text_input", "").strip()) return prompts @classmethod diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 521b30e53..901cf6ca2 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -586,7 +586,7 @@ def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: if args.extra_inputs: for input_str in args.extra_inputs: if input_str.startswith("{") and input_str.endswith("}"): - request_inputs.update(json.loads(input_str)) + request_inputs.update(utils.load_json_str(input_str)) else: semicolon_count = input_str.count(":") if semicolon_count != 1: diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py index cbb2da5ee..4ec1bec62 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py @@ -37,7 +37,7 @@ ResponseFormat, ) from genai_perf.tokenizer import Tokenizer -from genai_perf.utils import remove_sse_prefix +from genai_perf.utils import load_json_str, remove_sse_prefix class LLMProfileDataParser(ProfileDataParser): @@ -178,7 +178,7 @@ def _preprocess_response( response = res_outputs[i]["response"] responses = response.strip().split("\n\n") if len(responses) > 1: - merged_response = json.loads(remove_sse_prefix(responses[0])) + merged_response = load_json_str(remove_sse_prefix(responses[0])) if ( merged_response["choices"][0]["delta"].get("content", None) is None @@ -213,7 +213,7 @@ def _get_input_token_count(self, req_inputs: dict) -> int: def _get_openai_input_text(self, req_inputs: dict) -> str: """Tokenize the OpenAI request input texts.""" - payload = json.loads(req_inputs["payload"]) + payload = load_json_str(req_inputs["payload"]) if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS: return payload["messages"][0]["content"] elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: @@ -268,7 +268,7 @@ def 
_extract_openai_text_output(self, response: str) -> str: if response == "[DONE]": return "" - data = json.loads(response) + data = load_json_str(response) completions = data["choices"][0] text_output = "" diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/utils.py b/src/c++/perf_analyzer/genai-perf/genai_perf/utils.py index a10befe13..6f66230c4 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/utils.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/utils.py @@ -29,10 +29,14 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Type +import genai_perf.logging as logging + # Skip type checking to avoid mypy error # Issue: https://github.com/python/mypy/issues/10632 import yaml # type: ignore +logger = logging.getLogger(__name__) + def remove_sse_prefix(msg: str) -> str: prefix = "data: " @@ -49,7 +53,17 @@ def load_yaml(filepath: Path) -> Dict[str, Any]: def load_json(filepath: Path) -> Dict[str, Any]: with open(str(filepath), encoding="utf-8", errors="ignore") as f: - return json.load(f) + content = f.read() + return load_json_str(content) + + +def load_json_str(json_str: str) -> Dict[str, Any]: + try: + return json.loads(json_str) + except json.JSONDecodeError: + snippet = json_str[:200] + ("..." if len(json_str) > 200 else "") + logger.error("Failed to parse JSON string: '%s'", snippet) + raise def remove_file(file: Path) -> None:
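Note: PATCH 9/9 replaces direct `json.loads` calls on user-supplied input (JSONL prompt files, `--extra-inputs` JSON, profile export payloads) with a single `load_json_str` helper so that a parse failure reports the string that failed. Below is a minimal, self-contained sketch of that behavior with a usage example; it mirrors the helper shown in the utils.py diff above but is a standalone illustration, not the library code.

```python
# Standalone sketch: wrap json.loads so a malformed JSONL line is reported
# with the offending text instead of only a bare JSONDecodeError.
import json
import logging
from typing import Any, Dict

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)


def load_json_str(json_str: str) -> Dict[str, Any]:
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        logger.error("Failed to parse JSON string: '%s'", json_str)
        raise


if __name__ == "__main__":
    good = '{"text": "What was the first car ever driven?"}'
    bad = '{"text": "missing closing brace"'
    print(load_json_str(good))
    try:
        load_json_str(bad)
    except json.JSONDecodeError:
        # The failed string has already been logged, so the caller can see
        # exactly which input line broke the run before the exception surfaces.
        pass
```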