From c82291744eabc2c3d45ffab39113581a72daed91 Mon Sep 17 00:00:00 2001 From: Ganesh Kudleppanavar <158088791+ganeshku1@users.noreply.github.com> Date: Fri, 10 May 2024 19:27:46 -0500 Subject: [PATCH 01/15] Revert "Changes to support Ensemble Top Level Response Caching (#560) (#642)" This reverts commit cc6a3b27b7d281079e6baa2f2ad7bb0362ed79fe. --- src/c++/perf_analyzer/inference_profiler.cc | 58 +++---------------- src/c++/perf_analyzer/inference_profiler.h | 29 ---------- src/c++/perf_analyzer/model_parser.cc | 4 -- src/c++/perf_analyzer/model_parser.h | 23 +------- .../perf_analyzer/test_inference_profiler.cc | 26 --------- 5 files changed, 10 insertions(+), 130 deletions(-) diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc index 4d6af44b6..d0e2f179e 100644 --- a/src/c++/perf_analyzer/inference_profiler.cc +++ b/src/c++/perf_analyzer/inference_profiler.cc @@ -107,14 +107,6 @@ EnsembleDurations GetTotalEnsembleDurations(const ServerSideStats& stats) { EnsembleDurations result; - // Calculate avg cache hit latency and cache miss latency for ensemble model - // in case top level response caching is enabled. - const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count; - const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count; - result.total_cache_hit_time_avg_us += - AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt); - result.total_cache_miss_time_avg_us += - AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt); for (const auto& model_stats : stats.composing_models_stat) { if (model_stats.second.composing_models_stat.empty()) { // Cache hit count covers cache hits, not related to compute times @@ -246,6 +238,7 @@ ReportServerSideStats( if (parser->ResponseCacheEnabled()) { const uint64_t overhead_avg_us = GetOverheadDuration( cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us); + std::cout << " (overhead " << overhead_avg_us << " usec + " << "queue " << queue_avg_us << " usec + " << "cache hit/miss " << combined_cache_compute_avg_us @@ -290,18 +283,12 @@ ReportServerSideStats( const uint64_t overhead_avg_us = GetOverheadDuration( cumm_avg_us, ensemble_times.total_queue_time_avg_us, ensemble_times.total_combined_cache_compute_time_avg_us); - // FIXME - Refactor these calculations in case of ensemble top level - // response cache is enabled - if (!parser->TopLevelResponseCachingEnabled()) { - std::cout << " (overhead " << overhead_avg_us << " usec + " - << "queue " << ensemble_times.total_queue_time_avg_us - << " usec + " - << "cache hit/miss " - << ensemble_times.total_combined_cache_compute_time_avg_us - << " usec)" << std::endl; - } else { - std::cout << std::endl; - } + std::cout << " (overhead " << overhead_avg_us << " usec + " + << "queue " << ensemble_times.total_queue_time_avg_us + << " usec + " + << "cache hit/miss " + << ensemble_times.total_combined_cache_compute_time_avg_us + << " usec)" << std::endl; std::cout << ident << ident << " Average Cache Hit Latency: " << ensemble_times.total_cache_hit_time_avg_us << " usec" << std::endl; @@ -1563,21 +1550,6 @@ InferenceProfiler::DetermineStatsModelVersion( return cb::Error::Success; } -// Only for unit-testing -#ifndef DOCTEST_CONFIG_DISABLE -cb::Error -InferenceProfiler::SetTopLevelResponseCaching( - bool enable_top_level_response_caching) -{ - parser_ = std::make_shared(cb::BackendKind::TRITON); - if (parser_ == nullptr) { - return cb::Error("Failed to initialize ModelParser"); - } - 
parser_->SetTopLevelResponseCaching(enable_top_level_response_caching); - return cb::Error::Success; -} -#endif - cb::Error InferenceProfiler::SummarizeServerStats( const std::map& start_status, @@ -1633,20 +1605,8 @@ InferenceProfiler::SummarizeServerStatsHelper( const auto& end_itr = end_status.find(this_id); if (end_itr == end_status.end()) { - // In case of ensemble models, if top level response caching is enabled, - // the composing models statistics are unavailable in case of a cache hit. - // This is due to the scheduler sends cache response and composing models do - // not get executed. It's a valid scenario and shouldn't throw error. - bool stats_not_found_and_invalid = - model_version == -1 && !parser_->TopLevelResponseCachingEnabled(); - if (stats_not_found_and_invalid) { - return cb::Error( - "missing statistics for requested model", pa::GENERIC_ERROR); - } else { - // Setting server stats 0 for composing model in case of ensemble request - // cache hit since the composing model will not be executed - server_stats->Reset(); - } + return cb::Error( + "missing statistics for requested model", pa::GENERIC_ERROR); } else { uint64_t start_infer_cnt = 0; uint64_t start_exec_cnt = 0; diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h index 013dd0483..37c9b8397 100644 --- a/src/c++/perf_analyzer/inference_profiler.h +++ b/src/c++/perf_analyzer/inference_profiler.h @@ -52,7 +52,6 @@ namespace triton { namespace perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class NaggyMockInferenceProfiler; class TestInferenceProfiler; -class ModelParser; #endif /// Constant parameters that determine the whether stopping criteria has met @@ -120,28 +119,6 @@ struct ServerSideStats { uint64_t cache_miss_time_ns; std::map composing_models_stat; - // This function sets composing model server stats to 0 in case of a cache hit - // when top level response cache is enabled, since composing models are not - // executed and do not have any stats - void Reset() - { - inference_count = 0; - execution_count = 0; - success_count = 0; - queue_count = 0; - compute_input_count = 0; - compute_infer_count = 0; - compute_output_count = 0; - cumm_time_ns = 0; - queue_time_ns = 0; - compute_input_time_ns = 0; - compute_infer_time_ns = 0; - compute_output_time_ns = 0; - cache_hit_count = 0; - cache_hit_time_ns = 0; - cache_miss_count = 0; - cache_miss_time_ns = 0; - } }; /// Holds the statistics recorded at the client side. @@ -576,17 +553,12 @@ class InferenceProfiler { /// measurement /// \param end_stats The stats for all models at the end of the measurement /// \param model_version The determined model version - cb::Error DetermineStatsModelVersion( const cb::ModelIdentifier& model_identifier, const std::map& start_stats, const std::map& end_stats, int64_t* model_version); -#ifndef DOCTEST_CONFIG_DISABLE - cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching); -#endif - /// \param start_status The model status at the start of the measurement. /// \param end_status The model status at the end of the measurement. 
/// \param server_stats Returns the summary that the fields recorded by server @@ -789,7 +761,6 @@ class InferenceProfiler { #ifndef DOCTEST_CONFIG_DISABLE friend NaggyMockInferenceProfiler; friend TestInferenceProfiler; - friend ModelParser; public: InferenceProfiler() = default; diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc index 8ffea56da..1ab9f7a6d 100644 --- a/src/c++/perf_analyzer/model_parser.cc +++ b/src/c++/perf_analyzer/model_parser.cc @@ -169,10 +169,6 @@ ModelParser::InitTriton( response_cache_enabled_ = cache_itr->value["enable"].GetBool(); } - if (cache_itr != config.MemberEnd()) { - top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool(); - } - return cb::Error::Success; } diff --git a/src/c++/perf_analyzer/model_parser.h b/src/c++/perf_analyzer/model_parser.h index ac76b3e22..c1400d079 100644 --- a/src/c++/perf_analyzer/model_parser.h +++ b/src/c++/perf_analyzer/model_parser.h @@ -35,7 +35,6 @@ namespace triton { namespace perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class TestModelParser; class MockModelParser; -class InferenceProfiler; #endif struct ModelTensor { @@ -74,8 +73,7 @@ class ModelParser { outputs_(std::make_shared()), composing_models_map_(std::make_shared()), scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false), - response_cache_enabled_(false), - top_level_response_caching_enabled_(false) + response_cache_enabled_(false) { } @@ -153,22 +151,6 @@ class ModelParser { /// model bool ResponseCacheEnabled() const { return response_cache_enabled_; } - /// Returns whether or not top level request caching is enabled for this model - /// \return the truth value of whether top level request caching is enabled - /// for this model - bool TopLevelResponseCachingEnabled() const - { - return top_level_response_caching_enabled_; - } - -/// Only for testing -#ifndef DOCTEST_CONFIG_DISABLE - void SetTopLevelResponseCaching(bool enable_top_level_response_caching) - { - top_level_response_caching_enabled_ = enable_top_level_response_caching; - } -#endif - /// Get the details about the model inputs. /// \return The map with tensor_name and the tensor details /// stored as key-value pair. 
@@ -187,7 +169,6 @@ class ModelParser { return composing_models_map_; } - protected: ModelSchedulerType scheduler_type_; bool is_decoupled_; @@ -239,12 +220,10 @@ class ModelParser { std::string model_signature_name_; size_t max_batch_size_; bool response_cache_enabled_; - bool top_level_response_caching_enabled_; #ifndef DOCTEST_CONFIG_DISABLE friend TestModelParser; friend MockModelParser; - friend InferenceProfiler; public: ModelParser() = default; diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc index 8ff39605b..683219f15 100644 --- a/src/c++/perf_analyzer/test_inference_profiler.cc +++ b/src/c++/perf_analyzer/test_inference_profiler.cc @@ -160,15 +160,8 @@ class TestInferenceProfiler : public InferenceProfiler { return InferenceProfiler::DetermineStatsModelVersion( model_identifier, start_stats, end_stats, model_version); } - - cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching) - { - return InferenceProfiler::SetTopLevelResponseCaching( - enable_top_level_response_caching); - } }; - TEST_CASE("testing the ValidLatencyMeasurement function") { size_t valid_sequence_count{}; @@ -857,25 +850,6 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()") expect_exception = true; } - SUBCASE("One entry - version -1 - valid and in start") - { - model_identifier = {"ModelA", "-1"}; - start_stats_map.insert({{"ModelA", "3"}, old_stats}); - end_stats_map.insert({{"ModelA", "3"}, new_stats}); - cb::Error status = tip.SetTopLevelResponseCaching(true); - CHECK(status.IsOk()); - expected_model_version = -1; - } - - SUBCASE("One entry - version -1 - not valid") - { - model_identifier = {"ModelA", "-1"}; - end_stats_map.insert({{"ModelA", "3"}, old_stats}); - cb::Error status = tip.SetTopLevelResponseCaching(false); - CHECK(status.IsOk()); - expected_model_version = -1; - expect_exception = true; - } std::stringstream captured_cerr; std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf()); From fddba6d38ff78b8500af558ef0c86e95fcf84c07 Mon Sep 17 00:00:00 2001 From: Harshini Komali <157742537+lkomali@users.noreply.github.com> Date: Thu, 9 May 2024 13:38:45 -0700 Subject: [PATCH 02/15] Changes to support Ensemble Top Level Response Caching (#560) --- src/c++/perf_analyzer/inference_profiler.cc | 58 ++++++++++++++++--- src/c++/perf_analyzer/inference_profiler.h | 29 ++++++++++ src/c++/perf_analyzer/model_parser.cc | 4 ++ src/c++/perf_analyzer/model_parser.h | 23 +++++++- .../perf_analyzer/test_inference_profiler.cc | 26 +++++++++ 5 files changed, 130 insertions(+), 10 deletions(-) diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc index d0e2f179e..4d6af44b6 100644 --- a/src/c++/perf_analyzer/inference_profiler.cc +++ b/src/c++/perf_analyzer/inference_profiler.cc @@ -107,6 +107,14 @@ EnsembleDurations GetTotalEnsembleDurations(const ServerSideStats& stats) { EnsembleDurations result; + // Calculate avg cache hit latency and cache miss latency for ensemble model + // in case top level response caching is enabled. 
+ const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count; + const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count; + result.total_cache_hit_time_avg_us += + AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt); + result.total_cache_miss_time_avg_us += + AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt); for (const auto& model_stats : stats.composing_models_stat) { if (model_stats.second.composing_models_stat.empty()) { // Cache hit count covers cache hits, not related to compute times @@ -238,7 +246,6 @@ ReportServerSideStats( if (parser->ResponseCacheEnabled()) { const uint64_t overhead_avg_us = GetOverheadDuration( cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us); - std::cout << " (overhead " << overhead_avg_us << " usec + " << "queue " << queue_avg_us << " usec + " << "cache hit/miss " << combined_cache_compute_avg_us @@ -283,12 +290,18 @@ ReportServerSideStats( const uint64_t overhead_avg_us = GetOverheadDuration( cumm_avg_us, ensemble_times.total_queue_time_avg_us, ensemble_times.total_combined_cache_compute_time_avg_us); - std::cout << " (overhead " << overhead_avg_us << " usec + " - << "queue " << ensemble_times.total_queue_time_avg_us - << " usec + " - << "cache hit/miss " - << ensemble_times.total_combined_cache_compute_time_avg_us - << " usec)" << std::endl; + // FIXME - Refactor these calculations in case of ensemble top level + // response cache is enabled + if (!parser->TopLevelResponseCachingEnabled()) { + std::cout << " (overhead " << overhead_avg_us << " usec + " + << "queue " << ensemble_times.total_queue_time_avg_us + << " usec + " + << "cache hit/miss " + << ensemble_times.total_combined_cache_compute_time_avg_us + << " usec)" << std::endl; + } else { + std::cout << std::endl; + } std::cout << ident << ident << " Average Cache Hit Latency: " << ensemble_times.total_cache_hit_time_avg_us << " usec" << std::endl; @@ -1550,6 +1563,21 @@ InferenceProfiler::DetermineStatsModelVersion( return cb::Error::Success; } +// Only for unit-testing +#ifndef DOCTEST_CONFIG_DISABLE +cb::Error +InferenceProfiler::SetTopLevelResponseCaching( + bool enable_top_level_response_caching) +{ + parser_ = std::make_shared(cb::BackendKind::TRITON); + if (parser_ == nullptr) { + return cb::Error("Failed to initialize ModelParser"); + } + parser_->SetTopLevelResponseCaching(enable_top_level_response_caching); + return cb::Error::Success; +} +#endif + cb::Error InferenceProfiler::SummarizeServerStats( const std::map& start_status, @@ -1605,8 +1633,20 @@ InferenceProfiler::SummarizeServerStatsHelper( const auto& end_itr = end_status.find(this_id); if (end_itr == end_status.end()) { - return cb::Error( - "missing statistics for requested model", pa::GENERIC_ERROR); + // In case of ensemble models, if top level response caching is enabled, + // the composing models statistics are unavailable in case of a cache hit. + // This is due to the scheduler sends cache response and composing models do + // not get executed. It's a valid scenario and shouldn't throw error. 
+ bool stats_not_found_and_invalid = + model_version == -1 && !parser_->TopLevelResponseCachingEnabled(); + if (stats_not_found_and_invalid) { + return cb::Error( + "missing statistics for requested model", pa::GENERIC_ERROR); + } else { + // Setting server stats 0 for composing model in case of ensemble request + // cache hit since the composing model will not be executed + server_stats->Reset(); + } } else { uint64_t start_infer_cnt = 0; uint64_t start_exec_cnt = 0; diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h index 37c9b8397..013dd0483 100644 --- a/src/c++/perf_analyzer/inference_profiler.h +++ b/src/c++/perf_analyzer/inference_profiler.h @@ -52,6 +52,7 @@ namespace triton { namespace perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class NaggyMockInferenceProfiler; class TestInferenceProfiler; +class ModelParser; #endif /// Constant parameters that determine the whether stopping criteria has met @@ -119,6 +120,28 @@ struct ServerSideStats { uint64_t cache_miss_time_ns; std::map composing_models_stat; + // This function sets composing model server stats to 0 in case of a cache hit + // when top level response cache is enabled, since composing models are not + // executed and do not have any stats + void Reset() + { + inference_count = 0; + execution_count = 0; + success_count = 0; + queue_count = 0; + compute_input_count = 0; + compute_infer_count = 0; + compute_output_count = 0; + cumm_time_ns = 0; + queue_time_ns = 0; + compute_input_time_ns = 0; + compute_infer_time_ns = 0; + compute_output_time_ns = 0; + cache_hit_count = 0; + cache_hit_time_ns = 0; + cache_miss_count = 0; + cache_miss_time_ns = 0; + } }; /// Holds the statistics recorded at the client side. @@ -553,12 +576,17 @@ class InferenceProfiler { /// measurement /// \param end_stats The stats for all models at the end of the measurement /// \param model_version The determined model version + cb::Error DetermineStatsModelVersion( const cb::ModelIdentifier& model_identifier, const std::map& start_stats, const std::map& end_stats, int64_t* model_version); +#ifndef DOCTEST_CONFIG_DISABLE + cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching); +#endif + /// \param start_status The model status at the start of the measurement. /// \param end_status The model status at the end of the measurement. 
/// \param server_stats Returns the summary that the fields recorded by server @@ -761,6 +789,7 @@ class InferenceProfiler { #ifndef DOCTEST_CONFIG_DISABLE friend NaggyMockInferenceProfiler; friend TestInferenceProfiler; + friend ModelParser; public: InferenceProfiler() = default; diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc index 1ab9f7a6d..8ffea56da 100644 --- a/src/c++/perf_analyzer/model_parser.cc +++ b/src/c++/perf_analyzer/model_parser.cc @@ -169,6 +169,10 @@ ModelParser::InitTriton( response_cache_enabled_ = cache_itr->value["enable"].GetBool(); } + if (cache_itr != config.MemberEnd()) { + top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool(); + } + return cb::Error::Success; } diff --git a/src/c++/perf_analyzer/model_parser.h b/src/c++/perf_analyzer/model_parser.h index c1400d079..ac76b3e22 100644 --- a/src/c++/perf_analyzer/model_parser.h +++ b/src/c++/perf_analyzer/model_parser.h @@ -35,6 +35,7 @@ namespace triton { namespace perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class TestModelParser; class MockModelParser; +class InferenceProfiler; #endif struct ModelTensor { @@ -73,7 +74,8 @@ class ModelParser { outputs_(std::make_shared()), composing_models_map_(std::make_shared()), scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false), - response_cache_enabled_(false) + response_cache_enabled_(false), + top_level_response_caching_enabled_(false) { } @@ -151,6 +153,22 @@ class ModelParser { /// model bool ResponseCacheEnabled() const { return response_cache_enabled_; } + /// Returns whether or not top level request caching is enabled for this model + /// \return the truth value of whether top level request caching is enabled + /// for this model + bool TopLevelResponseCachingEnabled() const + { + return top_level_response_caching_enabled_; + } + +/// Only for testing +#ifndef DOCTEST_CONFIG_DISABLE + void SetTopLevelResponseCaching(bool enable_top_level_response_caching) + { + top_level_response_caching_enabled_ = enable_top_level_response_caching; + } +#endif + /// Get the details about the model inputs. /// \return The map with tensor_name and the tensor details /// stored as key-value pair. 
@@ -169,6 +187,7 @@ class ModelParser { return composing_models_map_; } + protected: ModelSchedulerType scheduler_type_; bool is_decoupled_; @@ -220,10 +239,12 @@ class ModelParser { std::string model_signature_name_; size_t max_batch_size_; bool response_cache_enabled_; + bool top_level_response_caching_enabled_; #ifndef DOCTEST_CONFIG_DISABLE friend TestModelParser; friend MockModelParser; + friend InferenceProfiler; public: ModelParser() = default; diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc index 683219f15..8ff39605b 100644 --- a/src/c++/perf_analyzer/test_inference_profiler.cc +++ b/src/c++/perf_analyzer/test_inference_profiler.cc @@ -160,8 +160,15 @@ class TestInferenceProfiler : public InferenceProfiler { return InferenceProfiler::DetermineStatsModelVersion( model_identifier, start_stats, end_stats, model_version); } + + cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching) + { + return InferenceProfiler::SetTopLevelResponseCaching( + enable_top_level_response_caching); + } }; + TEST_CASE("testing the ValidLatencyMeasurement function") { size_t valid_sequence_count{}; @@ -850,6 +857,25 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()") expect_exception = true; } + SUBCASE("One entry - version -1 - valid and in start") + { + model_identifier = {"ModelA", "-1"}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + cb::Error status = tip.SetTopLevelResponseCaching(true); + CHECK(status.IsOk()); + expected_model_version = -1; + } + + SUBCASE("One entry - version -1 - not valid") + { + model_identifier = {"ModelA", "-1"}; + end_stats_map.insert({{"ModelA", "3"}, old_stats}); + cb::Error status = tip.SetTopLevelResponseCaching(false); + CHECK(status.IsOk()); + expected_model_version = -1; + expect_exception = true; + } std::stringstream captured_cerr; std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf()); From 61ae566ad7fe750c6537bea1488cdd0fe02e92b8 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 24 May 2024 10:24:21 -0700 Subject: [PATCH 03/15] initial generate support --- .../client_backend/openai/openai_client.cc | 29 ++++- .../client_backend/openai/openai_client.h | 3 +- .../genai_perf/llm_inputs/llm_inputs.py | 104 +++++++++++++++++- .../genai-perf/genai_perf/llm_metrics.py | 25 ++++- .../genai-perf/genai_perf/parser.py | 8 +- 5 files changed, 156 insertions(+), 13 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index cd517f6a6..7f503e947 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -63,6 +63,14 @@ namespace openai { void ChatCompletionRequest::SendResponse(bool is_final, bool is_null) { + // if final response has already been sent + // due to detecting the [DONE] + // ignore final response due to request completion + if (final_response_sent_) { + return; + } + + final_response_sent_ = is_final; response_callback_(new ChatCompletionResult( http_code_, std::move(response_buffer_), is_final, is_null, request_id_)); } @@ -107,6 +115,7 @@ ChatCompletionClient::ResponseHeaderHandler( hdr.find("text/event-stream") != std::string::npos) { request->is_stream_ = true; } + return byte_size; } @@ -114,6 +123,7 @@ size_t ChatCompletionClient::ResponseHandler( void* contents, size_t size, size_t nmemb, void* 
userp) { + // [TODO TMA-1666] verify if the SSE responses received are complete, or the // response need to be stitched first. To verify, print out the received // responses from SendResponse() to make sure the OpenAI server doesn't chunk @@ -151,7 +161,7 @@ ChatCompletionClient::ResponseHandler( // RECV_END so that we always have the time of the last. request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::RECV_END); - + return result_bytes; } @@ -162,6 +172,8 @@ ChatCompletionClient::AsyncInfer( std::string& serialized_request_body, const std::string& request_id, const Headers& headers) { + + if (callback == nullptr) { return Error( "Callback function must be provided along with AsyncInfer() call."); @@ -172,9 +184,14 @@ ChatCompletionClient::AsyncInfer( request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::REQUEST_END); UpdateInferStat(request->timer_); - if (!request->is_stream_) { - request->SendResponse(true /* is_final */, false /* is_null */); - } + + // Updated to be ok to call multiple times + // will only send the first final response + // + // if (!request->is_stream_) { + // + request->SendResponse(true /* is_final */, false /* is_null */); + // } }; std::unique_ptr request(new ChatCompletionRequest( std::move(completion_callback), std::move(callback), request_id, @@ -185,7 +202,7 @@ ChatCompletionClient::AsyncInfer( request->AddInput( reinterpret_cast(serialized_request_body.data()), serialized_request_body.size()); - + CURL* multi_easy_handle = curl_easy_init(); Error err = PreRunProcessing(multi_easy_handle, raw_request, headers); if (!err.IsOk()) { @@ -226,7 +243,7 @@ ChatCompletionClient::PreRunProcessing( // response data handled by ResponseHandler() curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, request); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, request); const curl_off_t post_byte_size = request->total_input_byte_size_; curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, post_byte_size); diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h index aadcb3252..1da564d47 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -127,6 +127,7 @@ class ChatCompletionRequest : public HttpRequest { // The timers for infer request. 
triton::client::RequestTimers timer_; const std::string request_id_; + bool final_response_sent_{false}; }; class ChatCompletionClient : public HttpClient { @@ -172,7 +173,7 @@ class ChatCompletionClient : public HttpClient { void* contents, size_t size, size_t nmemb, void* userp); static size_t ResponseHeaderHandler( void* contents, size_t size, size_t nmemb, void* userp); - + Error UpdateInferStat(const triton::client::RequestTimers& timer); InferStat infer_stat_; }; diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 98792df4c..0254e375d 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -41,6 +41,7 @@ class PromptSource(Enum): class OutputFormat(Enum): OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() + TRITON_GENERATE = auto() TENSORRTLLM = auto() VLLM = auto() @@ -364,7 +365,18 @@ def _convert_generic_json_to_output_format( model_name: list = [], model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, ) -> Dict: - if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS: + if output_format == OutputFormat.TRITON_GENERATE: + output_json = cls._convert_generic_json_to_generate_format( + generic_dataset, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + ) + elif output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS: output_json = cls._convert_generic_json_to_openai_chat_completions_format( generic_dataset, add_model_name, @@ -454,6 +466,43 @@ def _convert_generic_json_to_openai_chat_completions_format( return pa_json + @classmethod + def _convert_generic_json_to_generate_format( + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + + ( + system_role_headers, + user_role_headers, + text_input_headers, + ) = cls._determine_json_feature_roles(dataset_json) + + + pa_json = cls._populate_triton_generate_output_json( + dataset_json, + system_role_headers, + user_role_headers, + text_input_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + ) + + return pa_json + + @classmethod def _convert_generic_json_to_openai_completions_format( cls, @@ -652,6 +701,59 @@ def _populate_openai_chat_completions_output_json( ) return pa_json + + @classmethod + def _populate_triton_generate_output_json( + cls, + dataset: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + number_of_rows = len(dataset["rows"]) + pa_json = cls._create_empty_trtllm_pa_json() + + default_max_tokens = ( + "max_tokens" not in extra_inputs + or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN + ) + + pa_json = {"data":[{"payload":[{}]} for _ in dataset["rows"]]} + + for index, entry in enumerate(dataset["rows"]): + + for header, content in entry.items(): + new_text_input = cls._create_new_text_input( + header, + system_role_headers, + user_role_headers, + text_input_headers, + 
content, + ) + pa_json["data"][index]["payload"][0]["text_input"] = new_text_input + + pa_json = cls._add_optional_tags_to_openai_json( + pa_json, + index, + False, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + ) + + return pa_json + + @classmethod def _populate_openai_completions_output_json( diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py index 6b1b9e2bd..a8df3e71a 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py @@ -46,6 +46,7 @@ class ResponseFormat(Enum): OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() TRITON = auto() + TRITON_GENERATE = auto() class Metrics: @@ -446,6 +447,8 @@ def _get_profile_metadata(self, data: dict) -> None: self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS elif data["endpoint"] == "v1/completions": self._response_format = ResponseFormat.OPENAI_COMPLETIONS + elif "generate" in data["endpoint"]: + self._response_format = ResponseFormat.TRITON_GENERATE else: # TPA-66: add PA metadata to handle this case # When endpoint field is either empty or custom endpoint, fall @@ -662,6 +665,8 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]: input_text = payload["messages"][0]["content"] elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: input_text = payload["prompt"] + elif self._response_format == ResponseFormat.TRITON_GENERATE: + input_text = payload["text_input"] else: raise ValueError( "Failed to parse OpenAI request input in profile export file." @@ -689,7 +694,10 @@ def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]] """Tokenize the OpenAI response output texts.""" output_texts = [] for output in res_outputs: - text = self._extract_openai_text_output(output["response"]) + if self._response_format == ResponseFormat.TRITON_GENERATE: + text = self._extract_generate_text_output(output["response"]) + else: + text = self._extract_openai_text_output(output["response"]) output_texts.append(text) return self._run_tokenizer(output_texts) @@ -702,6 +710,16 @@ def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]: encodings = self._tokenizer(output_texts) return [out[1:] for out in encodings.data["input_ids"]] + def _extract_generate_text_output(self, response: str) -> str: + + response = remove_sse_prefix(response) + + if response == "": + return response + + data = json.loads(response) + return data["text_output"] + def _extract_openai_text_output(self, response: str) -> str: """Extracts text/content of the OpenAI response object.""" response = remove_sse_prefix(response) @@ -731,7 +749,10 @@ def _extract_openai_text_output(self, response: str) -> str: def _is_openai_empty_response(self, response: str) -> bool: """Returns true if the response is an openai response with no content (or empty content)""" - text = self._extract_openai_text_output(response) + if self._response_format == ResponseFormat.TRITON_GENERATE: + text = self._extract_generate_text_output(response) + else: + text = self._extract_openai_text_output(response) if text: return False return True diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index ee886daf3..7f4055530 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ 
b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) -_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"} +_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions", "generate":"v2/models/{MODEL_NAME}/generate"} def _check_model_args( @@ -109,11 +109,13 @@ def _check_conditional_args( args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS elif args.endpoint_type == "completions": args.output_format = OutputFormat.OPENAI_COMPLETIONS + elif args.endpoint_type == "generate": + args.output_format = OutputFormat.TRITON_GENERATE if args.endpoint is not None: args.endpoint = args.endpoint.lstrip(" /") else: - args.endpoint = _endpoint_type_map[args.endpoint_type] + args.endpoint = _endpoint_type_map[args.endpoint_type].format(MODEL_NAME=args.model) elif args.endpoint_type is not None: parser.error( "The --endpoint-type option should only be used when using the 'openai' service-kind." @@ -400,7 +402,7 @@ def _add_endpoint_args(parser): endpoint_group.add_argument( "--endpoint-type", type=str, - choices=["chat", "completions"], + choices=["chat", "completions", "generate"], required=False, help=f"The endpoint-type to send requests to on the " 'server. This is only used with the "openai" service-kind.', From f1b46468c4b18a581b29365948b014c860e4a85b Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Sat, 25 May 2024 09:41:01 -0700 Subject: [PATCH 04/15] removing service type - infer from endpoint type --- .../genai-perf/genai_perf/parser.py | 59 +++++++------------ 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 7f4055530..f92e88250 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -98,33 +98,25 @@ def _check_conditional_args( Check for conditional args and raise an error if they are not set. """ - # Endpoint and output format checks - if args.service_kind == "openai": - if args.endpoint_type is None: - parser.error( - "The --endpoint-type option is required when using the 'openai' service-kind." - ) - else: - if args.endpoint_type == "chat": - args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS - elif args.endpoint_type == "completions": - args.output_format = OutputFormat.OPENAI_COMPLETIONS - elif args.endpoint_type == "generate": - args.output_format = OutputFormat.TRITON_GENERATE - - if args.endpoint is not None: - args.endpoint = args.endpoint.lstrip(" /") - else: - args.endpoint = _endpoint_type_map[args.endpoint_type].format(MODEL_NAME=args.model) - elif args.endpoint_type is not None: - parser.error( - "The --endpoint-type option should only be used when using the 'openai' service-kind." 
- ) - - if args.service_kind == "triton": + if args.endpoint_type == "chat": + args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS + args.service_kind = "openai" + elif args.endpoint_type == "completions": + args.output_format = OutputFormat.OPENAI_COMPLETIONS + args.service_kind = "openai" + elif args.endpoint_type == "generate": + args.output_format = OutputFormat.TRITON_GENERATE + args.service_kind = "openai" + elif args.endpoint_type == "kserve": + args.service_kind = "triton" args = _convert_str_to_enum_entry(args, "backend", OutputFormat) args.output_format = args.backend + if args.endpoint is not None: + args.endpoint = args.endpoint.lstrip(" /") + else: + args.endpoint = _endpoint_type_map[args.endpoint_type].format(MODEL_NAME=args.model) + # Output token distribution checks if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN: if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV: @@ -402,23 +394,12 @@ def _add_endpoint_args(parser): endpoint_group.add_argument( "--endpoint-type", type=str, - choices=["chat", "completions", "generate"], + choices=["chat", "completions", "generate", "kserve"], + default="kserve", required=False, - help=f"The endpoint-type to send requests to on the " - 'server. This is only used with the "openai" service-kind.', + help=f"The endpoint-type for requests. Inputs will be formatted according to endpoint-type.", ) - - endpoint_group.add_argument( - "--service-kind", - type=str, - choices=["triton", "openai"], - default="triton", - required=False, - help="The kind of service perf_analyzer will " - 'generate load for. In order to use "openai", ' - "you must specify an api via --endpoint-type.", - ) - + endpoint_group.add_argument( "--streaming", action="store_true", From 56e662c41eb9aea60591eb78128d13a7d939b010 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 16:33:13 -0700 Subject: [PATCH 05/15] updated to remove unused variables --- .../genai-perf/genai_perf/llm_inputs/llm_inputs.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 0254e375d..b2ac8bdeb 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -717,18 +717,10 @@ def _populate_triton_generate_output_json( output_tokens_deterministic: bool, model_name: str = "", ) -> Dict: - number_of_rows = len(dataset["rows"]) - pa_json = cls._create_empty_trtllm_pa_json() - - default_max_tokens = ( - "max_tokens" not in extra_inputs - or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN - ) pa_json = {"data":[{"payload":[{}]} for _ in dataset["rows"]]} - for index, entry in enumerate(dataset["rows"]): - + for index, entry in enumerate(dataset["rows"]): for header, content in entry.items(): new_text_input = cls._create_new_text_input( header, @@ -738,7 +730,7 @@ def _populate_triton_generate_output_json( content, ) pa_json["data"][index]["payload"][0]["text_input"] = new_text_input - + pa_json = cls._add_optional_tags_to_openai_json( pa_json, index, From 5503c85987e66a98e5f984b494855b8238071bc3 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 16:35:32 -0700 Subject: [PATCH 06/15] update with precommit run --- .../client_backend/openai/openai_client.cc | 13 ++--- .../client_backend/openai/openai_client.h | 2 +- .../genai_perf/llm_inputs/llm_inputs.py | 54 
+++++++++---------- .../genai-perf/genai_perf/llm_metrics.py | 1 - .../genai-perf/genai_perf/parser.py | 14 +++-- 5 files changed, 40 insertions(+), 44 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index 7f503e947..31e359dda 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -115,7 +115,7 @@ ChatCompletionClient::ResponseHeaderHandler( hdr.find("text/event-stream") != std::string::npos) { request->is_stream_ = true; } - + return byte_size; } @@ -123,7 +123,6 @@ size_t ChatCompletionClient::ResponseHandler( void* contents, size_t size, size_t nmemb, void* userp) { - // [TODO TMA-1666] verify if the SSE responses received are complete, or the // response need to be stitched first. To verify, print out the received // responses from SendResponse() to make sure the OpenAI server doesn't chunk @@ -161,7 +160,7 @@ ChatCompletionClient::ResponseHandler( // RECV_END so that we always have the time of the last. request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::RECV_END); - + return result_bytes; } @@ -172,8 +171,6 @@ ChatCompletionClient::AsyncInfer( std::string& serialized_request_body, const std::string& request_id, const Headers& headers) { - - if (callback == nullptr) { return Error( "Callback function must be provided along with AsyncInfer() call."); @@ -189,7 +186,7 @@ ChatCompletionClient::AsyncInfer( // will only send the first final response // // if (!request->is_stream_) { - // + // request->SendResponse(true /* is_final */, false /* is_null */); // } }; @@ -202,7 +199,7 @@ ChatCompletionClient::AsyncInfer( request->AddInput( reinterpret_cast(serialized_request_body.data()), serialized_request_body.size()); - + CURL* multi_easy_handle = curl_easy_init(); Error err = PreRunProcessing(multi_easy_handle, raw_request, headers); if (!err.IsOk()) { @@ -243,7 +240,7 @@ ChatCompletionClient::PreRunProcessing( // response data handled by ResponseHandler() curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, request); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, request); const curl_off_t post_byte_size = request->total_input_byte_size_; curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, post_byte_size); diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h index 1da564d47..e63728fc4 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -173,7 +173,7 @@ class ChatCompletionClient : public HttpClient { void* contents, size_t size, size_t nmemb, void* userp); static size_t ResponseHeaderHandler( void* contents, size_t size, size_t nmemb, void* userp); - + Error UpdateInferStat(const triton::client::RequestTimers& timer); InferStat infer_stat_; }; diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index b2ac8bdeb..c3e9353e6 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -468,24 +468,22 @@ def _convert_generic_json_to_openai_chat_completions_format( @classmethod def _convert_generic_json_to_generate_format( - cls, - dataset_json: Dict, - add_model_name: 
bool, - add_stream: bool, - extra_inputs: Dict, - output_tokens_mean: int, - output_tokens_stddev: int, - output_tokens_deterministic: bool, - model_name: str = "", + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", ) -> Dict: - ( system_role_headers, user_role_headers, text_input_headers, ) = cls._determine_json_feature_roles(dataset_json) - pa_json = cls._populate_triton_generate_output_json( dataset_json, system_role_headers, @@ -502,7 +500,6 @@ def _convert_generic_json_to_generate_format( return pa_json - @classmethod def _convert_generic_json_to_openai_completions_format( cls, @@ -701,26 +698,25 @@ def _populate_openai_chat_completions_output_json( ) return pa_json - + @classmethod def _populate_triton_generate_output_json( - cls, - dataset: Dict, - system_role_headers: List[str], - user_role_headers: List[str], - text_input_headers: List[str], - add_model_name: bool, - add_stream: bool, - extra_inputs: Dict, - output_tokens_mean: int, - output_tokens_stddev: int, - output_tokens_deterministic: bool, - model_name: str = "", + cls, + dataset: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", ) -> Dict: + pa_json: dict = {"data": [{"payload": [{}]} for _ in dataset["rows"]]} - pa_json = {"data":[{"payload":[{}]} for _ in dataset["rows"]]} - - for index, entry in enumerate(dataset["rows"]): + for index, entry in enumerate(dataset["rows"]): for header, content in entry.items(): new_text_input = cls._create_new_text_input( header, @@ -745,8 +741,6 @@ def _populate_triton_generate_output_json( return pa_json - - @classmethod def _populate_openai_completions_output_json( cls, diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py index a8df3e71a..14b250735 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py @@ -711,7 +711,6 @@ def _run_tokenizer(self, output_texts: List[str]) -> List[List[int]]: return [out[1:] for out in encodings.data["input_ids"]] def _extract_generate_text_output(self, response: str) -> str: - response = remove_sse_prefix(response) if response == "": diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index f92e88250..cebb7cc97 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -51,7 +51,11 @@ logger = logging.getLogger(__name__) -_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions", "generate":"v2/models/{MODEL_NAME}/generate"} +_endpoint_type_map = { + "chat": "v1/chat/completions", + "completions": "v1/completions", + "generate": "v2/models/{MODEL_NAME}/generate", +} def _check_model_args( @@ -115,8 +119,10 @@ def _check_conditional_args( if args.endpoint is not None: args.endpoint = args.endpoint.lstrip(" /") else: - args.endpoint = _endpoint_type_map[args.endpoint_type].format(MODEL_NAME=args.model) - + args.endpoint = _endpoint_type_map[args.endpoint_type].format( + MODEL_NAME=args.model + ) + # Output token distribution checks if 
args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN: if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV: @@ -399,7 +405,7 @@ def _add_endpoint_args(parser): required=False, help=f"The endpoint-type for requests. Inputs will be formatted according to endpoint-type.", ) - + endpoint_group.add_argument( "--streaming", action="store_true", From bfc1c7aedb55cd70b166bb484c65944a143a0868 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 17:56:21 -0700 Subject: [PATCH 07/15] updating tests --- .../genai-perf/genai_perf/parser.py | 5 +- .../genai-perf/tests/test_cli.py | 56 +++++-------------- 2 files changed, 17 insertions(+), 44 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index cebb7cc97..8b9a32838 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -55,6 +55,7 @@ "chat": "v1/chat/completions", "completions": "v1/completions", "generate": "v2/models/{MODEL_NAME}/generate", + "kserve": "v2/models/{MODEL_NAME}/infer", } @@ -380,10 +381,10 @@ def _add_endpoint_args(parser): endpoint_group.add_argument( "--backend", type=str, - choices=utils.get_enum_names(OutputFormat)[2:], + choices=["tensorrtllm", "vllm"], default="tensorrtllm", required=False, - help=f'When using the "triton" service-kind, ' + help=f'When using the "kserve" endpoint type, ' "this is the backend of the model. " "For the TENSORRT-LLM backend, you currently must set " "'exclude_input_in_output' to true in the model config to " diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py index 3066d554a..d7f0139dc 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py @@ -80,19 +80,17 @@ def test_help_version_arguments_output_and_exit( ), (["--concurrency", "3"], {"concurrency": 3}), ( - ["--endpoint-type", "completions", "--service-kind", "openai"], + ["--endpoint-type", "completions"], {"endpoint": "v1/completions"}, ), ( - ["--endpoint-type", "chat", "--service-kind", "openai"], + ["--endpoint-type", "chat"], {"endpoint": "v1/chat/completions"}, ), ( [ "--endpoint-type", "chat", - "--service-kind", - "openai", "--endpoint", "custom/address", ], @@ -102,8 +100,6 @@ def test_help_version_arguments_output_and_exit( [ "--endpoint-type", "chat", - "--service-kind", - "openai", "--endpoint", " /custom/address", ], @@ -113,8 +109,6 @@ def test_help_version_arguments_output_and_exit( [ "--endpoint-type", "completions", - "--service-kind", - "openai", "--endpoint", "custom/address", ], @@ -164,9 +158,9 @@ def test_help_version_arguments_output_and_exit( (["--random-seed", "8"], {"random_seed": 8}), (["--request-rate", "9.0"], {"request_rate": 9.0}), (["-s", "99.5"], {"stability_percentage": 99.5}), - (["--service-kind", "triton"], {"service_kind": "triton"}), + (["--endpoint-type", "kserve"], {"service_kind": "triton"}), ( - ["--service-kind", "openai", "--endpoint-type", "chat"], + ["--endpoint-type", "chat"], {"service_kind": "openai", "endpoint": "v1/chat/completions"}, ), (["--stability-percentage", "99.5"], {"stability_percentage": 99.5}), @@ -263,25 +257,25 @@ def test_file_flags_parsed(self, monkeypatch, mocker): "arg, expected_path", [ ( - ["--service-kind", "openai", "--endpoint-type", "chat"], + ["--endpoint-type", "chat"], "artifacts/test_model-openai-chat-concurrency1", ), ( - ["--service-kind", 
"openai", "--endpoint-type", "completions"], + ["--endpoint-type", "completions"], "artifacts/test_model-openai-completions-concurrency1", ), ( - ["--service-kind", "triton", "--backend", "tensorrtllm"], + ["--endpoint-type", "kserve", "--backend", "tensorrtllm"], "artifacts/test_model-triton-tensorrtllm-concurrency1", ), ( - ["--service-kind", "triton", "--backend", "vllm"], + ["--endpoint-type", "kserve", "--backend", "vllm"], "artifacts/test_model-triton-vllm-concurrency1", ), ( [ - "--service-kind", - "triton", + "--endpoint-type", + "kserve", "--backend", "vllm", "--concurrency", @@ -318,8 +312,6 @@ def test_default_profile_export_filepath( [ "--model", "hello/world/test_model", - "--service-kind", - "openai", "--endpoint-type", "chat", ], @@ -407,22 +399,6 @@ def test_unrecognized_arg(self, monkeypatch, capsys): @pytest.mark.parametrize( "args, expected_output", [ - ( - ["genai-perf", "-m", "test_model", "--service-kind", "openai"], - "The --endpoint-type option is required when using the 'openai' service-kind.", - ), - ( - [ - "genai-perf", - "-m", - "test_model", - "--service-kind", - "openai", - "--endpoint", - "custom/address", - ], - "The --endpoint-type option is required when using the 'openai' service-kind.", - ), ( ["genai-perf", "-m", "test_model", "--output-tokens-stddev", "5"], "The --output-tokens-mean option is required when using --output-tokens-stddev.", @@ -450,8 +426,6 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "genai-perf", "-m", "test_model", - "--service-kind", - "openai", "--endpoint-type", "chat", "--output-tokens-mean", @@ -476,17 +450,15 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): "args, expected_format", [ ( - ["--service-kind", "openai", "--endpoint-type", "chat"], + ["--endpoint-type", "chat"], OutputFormat.OPENAI_CHAT_COMPLETIONS, ), ( - ["--service-kind", "openai", "--endpoint-type", "completions"], + ["--endpoint-type", "completions"], OutputFormat.OPENAI_COMPLETIONS, ), ( [ - "--service-kind", - "openai", "--endpoint-type", "completions", "--endpoint", @@ -495,10 +467,10 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): OutputFormat.OPENAI_COMPLETIONS, ), ( - ["--service-kind", "triton", "--backend", "tensorrtllm"], + ["--endpoint-type", "kserve", "--backend", "tensorrtllm"], OutputFormat.TENSORRTLLM, ), - (["--service-kind", "triton", "--backend", "vllm"], OutputFormat.VLLM), + (["--endpoint-type", "kserve", "--backend", "vllm"], OutputFormat.VLLM), ], ) def test_inferred_output_format(self, monkeypatch, args, expected_format): From d576d256ef79bb23515fb353a57c50d9a787577a Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 18:24:45 -0700 Subject: [PATCH 08/15] updating tests --- .../genai-perf/genai_perf/llm_inputs/llm_inputs.py | 1 - src/c++/perf_analyzer/genai-perf/genai_perf/parser.py | 4 ++-- src/c++/perf_analyzer/genai-perf/tests/test_cli.py | 2 +- src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py | 8 +++----- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index c3e9353e6..3681c51fc 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -232,7 +232,6 @@ def _get_input_dataset_from_url( url = cls._resolve_url(dataset_name) configured_url = cls._create_configured_url(url, starting_index, length) 
dataset = cls._download_dataset(configured_url) - return dataset @classmethod diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index 8b9a32838..b3d0837ca 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -138,7 +138,7 @@ def _check_conditional_args( if args.service_kind != "triton": if args.output_tokens_mean_deterministic: parser.error( - "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind." + "The --output-tokens-mean-deterministic option is only supported with the kserve endpoint type." ) return args @@ -273,7 +273,7 @@ def _add_input_args(parser): help=f"When using --output-tokens-mean, this flag can be set to " "improve precision by setting the minimum number of tokens " "equal to the requested number of tokens. This is currently " - "supported with the Triton service-kind. " + "supported with the kserve endpoint type. " "Note that there is still some variability in the requested number " "of output tokens, but GenAi-Perf attempts its best effort with your " "model to get the right number of output tokens. ", diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py index d7f0139dc..15050184c 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py @@ -432,7 +432,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "100", "--output-tokens-mean-deterministic", ], - "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind", + "The --output-tokens-mean-deterministic option is only supported with the kserve endpoint type", ), ], ) diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py b/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py index 184a47f11..8a33853ec 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_wrapper.py @@ -43,7 +43,7 @@ class TestWrapper: ], ) def test_url_exactly_once_triton(self, monkeypatch, arg): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = ["genai-perf", "-m", "test_model", "--endpoint-type", "kserve"] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -70,7 +70,7 @@ def test_url_exactly_once_triton(self, monkeypatch, arg): ], ) def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = ["genai-perf", "-m", "test_model", "--endpoint-type", "kserve"] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -87,7 +87,7 @@ def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): ], ) def test_service_triton(self, monkeypatch, arg): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = ["genai-perf", "-m", "test_model", "--endpoint-type", "kserve"] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -113,8 +113,6 @@ def test_service_openai(self, monkeypatch, arg): "genai-perf", "-m", "test_model", - "--service-kind", - "openai", ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = 
parser.parse_args() From 1f50b6e4b9a491d9ea06f7bb34633e4f0ebd066b Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 20:58:15 -0700 Subject: [PATCH 09/15] updated test to mark as potential failure --- .../genai-perf/tests/test_llm_inputs.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py index 4486ba3d9..86834f003 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py @@ -189,6 +189,119 @@ def test_llm_inputs_with_defaults(self, default_configured_url): dataset_json = LlmInputs._convert_input_url_dataset_to_generic_json( dataset=dataset ) + pa_json = LlmInputs._convert_generic_json_to_output_format( + output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + generic_dataset=dataset_json, + add_model_name=False, + add_stream=False, + extra_inputs={}, + output_tokens_mean=LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN, + output_tokens_stddev=LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV, + output_tokens_deterministic=False, + ) + + assert pa_json is not None + assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + @pytest.mark.xfail( + reason="Download from huggingface may fail due to server issues", + raises=GenAIPerfException, + ) + def test_create_openai_llm_inputs_cnn_dailymail(self): + """ + Test CNN_DAILYMAIL can be accessed + """ + pa_json = LlmInputs.create_llm_inputs( + input_type=PromptSource.DATASET, + dataset_name=CNN_DAILY_MAIL, + output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + ) + + os.remove(DEFAULT_INPUT_DATA_JSON) + + assert pa_json is not None + assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + def test_write_to_file(self): + """ + Test that write to file is working correctly + """ + pa_json = LlmInputs.create_llm_inputs( + input_type=PromptSource.DATASET, + dataset_name=OPEN_ORCA, + output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + model_name="open_orca", + add_model_name=True, + add_stream=True, + ) + try: + with open(DEFAULT_INPUT_DATA_JSON, "r") as f: + json_str = f.read() + finally: + os.remove(DEFAULT_INPUT_DATA_JSON) + + assert pa_json == json.loads(json_str) + + def test_create_openai_to_vllm(self): + """ + Test conversion of openai to vllm + """ + pa_json = LlmInputs.create_llm_inputs( + input_type=PromptSource.DATASET, + output_format=OutputFormat.VLLM, + dataset_name=OPEN_ORCA, + add_model_name=False, + add_stream=True, + ) + + os.remove(DEFAULT_INPUT_DATA_JSON) + + assert pa_json is not None + assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + def test_create_openai_to_completions(self): + """ + Test conversion of openai to completions + """ + pa_json = LlmInputs.create_llm_inputs( + input_type=PromptSource.DATASET, + output_format=OutputFormat.OPENAI_COMPLETIONS, + dataset_name=OPEN_ORCA, + add_model_name=False, + add_stream=True, + ) + + os.remove(DEFAULT_INPUT_DATA_JSON) + + assert pa_json is not None + assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + # NIM legacy completion endpoint only supports string and not + # array of strings. 
Verify that the prompt is of type string + # not list + assert isinstance(pa_json["data"][0]["payload"][0]["prompt"], str) + + def test_create_openai_to_trtllm(self): + """ + Test conversion of openai to trtllm + """ + pa_json = LlmInputs.create_llm_inputs( + input_type=PromptSource.DATASET, + output_format=OutputFormat.TENSORRTLLM, + dataset_name=OPEN_ORCA, + add_model_name=False, + add_stream=True, + ) + + os.remove(DEFAULT_INPUT_DATA_JSON) + + assert pa_json is not None + assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + def test_random_synthetic_no_stddev(self, default_tokenizer): + """ + Test that we can produce an exact number of random synthetic tokens + """ + random.seed(1) assert dataset_json is not None assert len(dataset_json["rows"]) == TEST_LENGTH From 8dcc53dd3220703b820efc8795bc0c642cb9ad2d Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 21:12:10 -0700 Subject: [PATCH 10/15] removing --- src/c++/perf_analyzer/client_backend/openai/openai_client.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index 31e359dda..05eaee26c 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -115,7 +115,6 @@ ChatCompletionClient::ResponseHeaderHandler( hdr.find("text/event-stream") != std::string::npos) { request->is_stream_ = true; } - return byte_size; } From 7f740a9b6c89c45635041732d8dfe4de5f3fdc2b Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 21:24:34 -0700 Subject: [PATCH 11/15] updating comment --- .../perf_analyzer/client_backend/openai/openai_client.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index 05eaee26c..e835f988e 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -181,13 +181,10 @@ ChatCompletionClient::AsyncInfer( triton::client::RequestTimers::Kind::REQUEST_END); UpdateInferStat(request->timer_); - // Updated to be ok to call multiple times - // will only send the first final response - // - // if (!request->is_stream_) { - // + // Send Response checks if a final + // response has already been sent + // (in the case of seeing [DONE] in streaming case) request->SendResponse(true /* is_final */, false /* is_null */); - // } }; std::unique_ptr request(new ChatCompletionRequest( std::move(completion_callback), std::move(callback), request_id, From 73ffa08192bb2b57b00d8ceb9785d8c6171b531f Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 21:41:01 -0700 Subject: [PATCH 12/15] undo changes --- .../genai-perf/tests/test_llm_inputs.py | 112 ------------------ 1 file changed, 112 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py index 86834f003..d5ffcbfba 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py @@ -189,118 +189,6 @@ def test_llm_inputs_with_defaults(self, default_configured_url): dataset_json = LlmInputs._convert_input_url_dataset_to_generic_json( dataset=dataset ) - pa_json = LlmInputs._convert_generic_json_to_output_format( - output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, - 
generic_dataset=dataset_json, - add_model_name=False, - add_stream=False, - extra_inputs={}, - output_tokens_mean=LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN, - output_tokens_stddev=LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV, - output_tokens_deterministic=False, - ) - - assert pa_json is not None - assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH - - @pytest.mark.xfail( - reason="Download from huggingface may fail due to server issues", - raises=GenAIPerfException, - ) - def test_create_openai_llm_inputs_cnn_dailymail(self): - """ - Test CNN_DAILYMAIL can be accessed - """ - pa_json = LlmInputs.create_llm_inputs( - input_type=PromptSource.DATASET, - dataset_name=CNN_DAILY_MAIL, - output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, - ) - - os.remove(DEFAULT_INPUT_DATA_JSON) - - assert pa_json is not None - assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH - - def test_write_to_file(self): - """ - Test that write to file is working correctly - """ - pa_json = LlmInputs.create_llm_inputs( - input_type=PromptSource.DATASET, - dataset_name=OPEN_ORCA, - output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, - model_name="open_orca", - add_model_name=True, - add_stream=True, - ) - try: - with open(DEFAULT_INPUT_DATA_JSON, "r") as f: - json_str = f.read() - finally: - os.remove(DEFAULT_INPUT_DATA_JSON) - - assert pa_json == json.loads(json_str) - - def test_create_openai_to_vllm(self): - """ - Test conversion of openai to vllm - """ - pa_json = LlmInputs.create_llm_inputs( - input_type=PromptSource.DATASET, - output_format=OutputFormat.VLLM, - dataset_name=OPEN_ORCA, - add_model_name=False, - add_stream=True, - ) - - os.remove(DEFAULT_INPUT_DATA_JSON) - - assert pa_json is not None - assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH - - def test_create_openai_to_completions(self): - """ - Test conversion of openai to completions - """ - pa_json = LlmInputs.create_llm_inputs( - input_type=PromptSource.DATASET, - output_format=OutputFormat.OPENAI_COMPLETIONS, - dataset_name=OPEN_ORCA, - add_model_name=False, - add_stream=True, - ) - - os.remove(DEFAULT_INPUT_DATA_JSON) - - assert pa_json is not None - assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH - # NIM legacy completion endpoint only supports string and not - # array of strings. 
Verify that the prompt is of type string - # not list - assert isinstance(pa_json["data"][0]["payload"][0]["prompt"], str) - - def test_create_openai_to_trtllm(self): - """ - Test conversion of openai to trtllm - """ - pa_json = LlmInputs.create_llm_inputs( - input_type=PromptSource.DATASET, - output_format=OutputFormat.TENSORRTLLM, - dataset_name=OPEN_ORCA, - add_model_name=False, - add_stream=True, - ) - - os.remove(DEFAULT_INPUT_DATA_JSON) - - assert pa_json is not None - assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH - - def test_random_synthetic_no_stddev(self, default_tokenizer): - """ - Test that we can produce an exact number of random synthetic tokens - """ random.seed(1) assert dataset_json is not None From 056c099087bf5aaa296936dc66afb93d017e68c0 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 21:42:05 -0700 Subject: [PATCH 13/15] removing unneeded change --- src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py index d5ffcbfba..4486ba3d9 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py @@ -189,7 +189,6 @@ def test_llm_inputs_with_defaults(self, default_configured_url): dataset_json = LlmInputs._convert_input_url_dataset_to_generic_json( dataset=dataset ) - random.seed(1) assert dataset_json is not None assert len(dataset_json["rows"]) == TEST_LENGTH From e7b3e533636058f23979a3350be40dccdf9dc8f7 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 22:10:02 -0700 Subject: [PATCH 14/15] updates for new model selection --- .../genai-perf/genai_perf/llm_inputs/llm_inputs.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 3681c51fc..a531d2ad5 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -475,7 +475,8 @@ def _convert_generic_json_to_generate_format( output_tokens_mean: int, output_tokens_stddev: int, output_tokens_deterministic: bool, - model_name: str = "", + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, ) -> Dict: ( system_role_headers, @@ -711,7 +712,8 @@ def _populate_triton_generate_output_json( output_tokens_mean: int, output_tokens_stddev: int, output_tokens_deterministic: bool, - model_name: str = "", + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, ) -> Dict: pa_json: dict = {"data": [{"payload": [{}]} for _ in dataset["rows"]]} @@ -726,6 +728,9 @@ def _populate_triton_generate_output_json( ) pa_json["data"][index]["payload"][0]["text_input"] = new_text_input + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) pa_json = cls._add_optional_tags_to_openai_json( pa_json, index, @@ -735,7 +740,7 @@ def _populate_triton_generate_output_json( output_tokens_mean, output_tokens_stddev, output_tokens_deterministic, - model_name, + iter_model_name, ) return pa_json From 0c7f5a10edebba3a2eb0634fe3d9b49b192f39e9 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 4 Jun 2024 22:26:28 -0700 Subject: [PATCH 15/15] updating expected values --- 
src/c++/perf_analyzer/genai-perf/genai_perf/parser.py | 6 +++++- .../perf_analyzer/genai-perf/tests/test_json_exporter.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py index b3d0837ca..5416ee331 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py @@ -120,8 +120,12 @@ def _check_conditional_args( if args.endpoint is not None: args.endpoint = args.endpoint.lstrip(" /") else: + if args.model: + model_name = args.model[0] + else: + model_name = "" args.endpoint = _endpoint_type_map[args.endpoint_type].format( - MODEL_NAME=args.model + MODEL_NAME=model_name ) # Output token distribution checks diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py index b97712e31..7bb76ee5e 100644 --- a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py +++ b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py @@ -210,8 +210,8 @@ class TestJsonExporter: "formatted_model_name": "gpt2_vllm", "model_selection_strategy": "round_robin", "backend": "vllm", - "endpoint": null, - "endpoint_type": null, + "endpoint": "v2/models/gpt2_vllm/infer", + "endpoint_type": "kserve", "service_kind": "triton", "streaming": true, "u": null,
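
Reviewer note on the last two patches: patch 14/15 threads a list of model names plus a ModelSelectionStrategy (default ROUND_ROBIN) through the generate-output path and picks a model per payload via LlmInputs._select_model_name(); patch 15/15 makes the default endpoint fall back to the first entry of --model (or "" when no model is given), which is why the expected endpoint in test_json_exporter.py becomes "v2/models/gpt2_vllm/infer". The sketch below is illustrative only and is not the code under review: the kserve URL template "v2/models/{MODEL_NAME}/infer" is inferred from that expected value, and select_model_name() is a hypothetical stand-in, since the body of _select_model_name is not shown in this diff.

    from enum import Enum, auto
    from typing import List


    class ModelSelectionStrategy(Enum):
        # ROUND_ROBIN is the default visible in the patch; other strategies
        # may exist upstream but are not shown here.
        ROUND_ROBIN = auto()


    def select_model_name(
        model_names: List[str],
        index: int,
        strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
    ) -> str:
        """Hypothetical stand-in for LlmInputs._select_model_name()."""
        if not model_names:
            return ""
        if strategy is ModelSelectionStrategy.ROUND_ROBIN:
            # Cycle through the configured models by payload index, so the
            # payload-to-model assignment stays deterministic.
            return model_names[index % len(model_names)]
        raise ValueError(f"Unsupported strategy: {strategy}")


    def build_default_kserve_endpoint(model_names: List[str]) -> str:
        """Mirror the parser.py change: use the first model, or "" if none."""
        kserve_template = "v2/models/{MODEL_NAME}/infer"  # assumed template
        model_name = model_names[0] if model_names else ""
        return kserve_template.format(MODEL_NAME=model_name)


    if __name__ == "__main__":
        models = ["gpt2_vllm", "opt125m_vllm"]
        # Round-robin assigns models to payload indices 0, 1, 2, ... in turn.
        assert [select_model_name(models, i) for i in range(3)] == [
            "gpt2_vllm",
            "opt125m_vllm",
            "gpt2_vllm",
        ]
        # Matches the updated expected value in test_json_exporter.py above.
        assert build_default_kserve_endpoint(models) == "v2/models/gpt2_vllm/infer"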