From c15288ab9c3693b0c1555d80cf6f238e0d44eed9 Mon Sep 17 00:00:00 2001
From: Katherine Yang
Date: Tue, 30 Jan 2024 19:44:48 -0800
Subject: [PATCH] temp commit to get a feel for LLM detection

---
 src/c++/perf_analyzer/command_line_parser.cc |  8 ++++
 src/c++/perf_analyzer/command_line_parser.h  |  3 ++
 src/c++/perf_analyzer/model_parser.cc        | 21 ++++++++++
 src/c++/perf_analyzer/model_parser.h         |  3 ++
 src/c++/perf_analyzer/perf_analyzer.cc       | 42 ++++++++++++++++++++
 5 files changed, 77 insertions(+)

diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc
index 711f1714e..7a2c9885f 100644
--- a/src/c++/perf_analyzer/command_line_parser.cc
+++ b/src/c++/perf_analyzer/command_line_parser.cc
@@ -188,6 +188,9 @@ CLParser::Usage(const std::string& msg)
   std::cerr << "\t--collect-metrics" << std::endl;
   std::cerr << "\t--metrics-url" << std::endl;
   std::cerr << "\t--metrics-interval" << std::endl;
+  std::cerr << "\t--is-llm-model enforces LLM-style dummy inputs and "
+               "generates LLM-specific metrics"
+            << std::endl;
   std::cerr << std::endl;
   std::cerr << "==== OPTIONS ==== \n \n";
 
@@ -875,6 +878,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
       {"periodic-concurrency-range", required_argument, 0, 59},
       {"request-period", required_argument, 0, 60},
       {"request-parameter", required_argument, 0, 61},
+      {"is-llm-model", no_argument, 0, 62},
       {0, 0, 0, 0}};
 
   // Parse commandline...
@@ -1608,6 +1612,10 @@ CLParser::ParseCommandLine(int argc, char** argv)
         params_->request_parameters[name] = param;
         break;
       }
+      case 62: {
+        params_->is_llm = true;
+        break;
+      }
      case 'v':
        params_->extra_verbose = params_->verbose;
        params_->verbose = true;
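Usage sketch for the new flag (the model name and export path below are hypothetical; -m and --profile-export-file are existing perf_analyzer options, the latter feeding the decoupled-model heuristic added further down):

    perf_analyzer -m my_vllm_model --is-llm-model --profile-export-file profile.json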
diff --git a/src/c++/perf_analyzer/command_line_parser.h b/src/c++/perf_analyzer/command_line_parser.h
index 9ff4869ff..077e3e783 100644
--- a/src/c++/perf_analyzer/command_line_parser.h
+++ b/src/c++/perf_analyzer/command_line_parser.h
@@ -125,6 +125,9 @@ struct PerfAnalyzerParameters {
   uint64_t metrics_interval_ms{1000};
   bool metrics_interval_ms_specified{false};
 
+  // Model is an LLM. Determines whether PA outputs LLM-related metrics.
+  bool is_llm{false};
+
   // Return true if targeting concurrency
   //
   bool targeting_concurrency() const
diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc
index ee7ab5303..9721b5ae2 100644
--- a/src/c++/perf_analyzer/model_parser.cc
+++ b/src/c++/perf_analyzer/model_parser.cc
@@ -169,6 +169,16 @@ ModelParser::InitTriton(
     response_cache_enabled_ = cache_itr->value["enable"].GetBool();
   }
 
+  // Check which backend the model uses
+  const auto backend_config_itr = config.FindMember("backend");
+  if (backend_config_itr != config.MemberEnd()) {
+    std::string backend_str;
+    RETURN_IF_ERROR(GetString(backend_config_itr->value, &backend_str));
+    if (backend_str == "vllm") {
+      backend_type_ = TritonBackendType::VLLM;
+    }
+  }
+
   return cb::Error::Success;
 }
 
@@ -434,4 +444,15 @@ ModelParser::GetInt(const rapidjson::Value& value, int64_t* integer_value)
   return cb::Error::Success;
 }
 
+cb::Error
+ModelParser::GetString(const rapidjson::Value& value, std::string* string_value)
+{
+  if (value.IsString()) {
+    std::string str(value.GetString(), value.GetStringLength());
+    *string_value = str;
+    return cb::Error::Success;
+  }
+  return cb::Error("Value is not a string", pa::GENERIC_ERROR);
+}
+
 }} // namespace triton::perfanalyzer
diff --git a/src/c++/perf_analyzer/model_parser.h b/src/c++/perf_analyzer/model_parser.h
index 4646433ab..7b043ae2d 100644
--- a/src/c++/perf_analyzer/model_parser.h
+++ b/src/c++/perf_analyzer/model_parser.h
@@ -67,6 +67,8 @@ class ModelParser {
     ENSEMBLE_SEQUENCE
   };
 
+  enum TritonBackendType { TENSORRT_LLM, VLLM, OTHER };
+
   explicit ModelParser(cb::BackendKind backend_kind)
       : backend_kind_(backend_kind),
         inputs_(std::make_shared<ModelTensorMap>()),
@@ -214,6 +216,7 @@ class ModelParser {
   std::string model_name_;
   std::string model_version_;
   std::string model_signature_name_;
+  TritonBackendType backend_type_ = TritonBackendType::OTHER;
   size_t max_batch_size_;
   bool response_cache_enabled_;
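For reference, the backend probe added to InitTriton() above reads the top-level "backend" field of the model configuration JSON that Triton returns. A vLLM model's config would look roughly like this (model name hypothetical; vLLM models also set the decoupled transaction policy, which the heuristic below relies on):

    {
      "name": "my_vllm_model",
      "backend": "vllm",
      "model_transaction_policy": { "decoupled": true }
    }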
diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc
index 46b665757..193083818 100644
--- a/src/c++/perf_analyzer/perf_analyzer.cc
+++ b/src/c++/perf_analyzer/perf_analyzer.cc
@@ -52,6 +52,44 @@ SignalHandler(int signum)
     exit(0);
   }
 }
+
+bool
+IsLLMModel(
+    const std::shared_ptr<ModelParser>& parser,
+    const pa::PAParamsPtr& params)
+{
+  // An explicit --is-llm-model flag from the user always wins
+  bool is_llm_from_user = params->is_llm;
+  if (is_llm_from_user) {
+    return true;
+  }
+
+  bool is_llm = false;
+  // Check if the model is decoupled and profiling data is being exported
+  is_llm =
+      is_llm || (parser->IsDecoupled() && !params->profile_export_file.empty());
+
+  // Check if this is an ensemble model with a composing model named
+  // "tensorrt_llm"; if so, it is certainly using the TensorRT-LLM backend
+  if (!parser->composing_models_map_->empty()) {
+    for (const auto& [_, model_name_version] : *parser->composing_models_map_) {
+      const std::string& composing_model_name = model_name_version.first;
+      if (composing_model_name == "tensorrt_llm") {
+        parser->backend_type_ = ModelParser::TritonBackendType::TENSORRT_LLM;
+        break;
+      }
+    }
+  }
+
+  // Check if the backend used is the vLLM or TensorRT-LLM backend
+  is_llm = is_llm ||
+           (parser->backend_type_ == ModelParser::TritonBackendType::VLLM ||
+            parser->backend_type_ ==
+                ModelParser::TritonBackendType::TENSORRT_LLM);
+
+  return is_llm;
+}
+
 }} // namespace triton::perfanalyzer
 
 PerfAnalyzer::PerfAnalyzer(pa::PAParamsPtr params) : params_(params)
@@ -428,6 +466,10 @@ PerfAnalyzer::WriteReport()
   bool should_output_metrics{
       params_->should_collect_metrics && params_->verbose_csv};
 
+  // TODO (TMA-1526): Detect if the model is LLM and report LLM metrics based
+  // on that signal. Currently IsLLMModel() relies on heuristics.
+  bool should_output_llm_metrics{IsLLMModel(parser_, params_)};
+
   std::unique_ptr<ReportWriter> writer;
 
   FAIL_IF_ERR(
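Distilled from IsLLMModel() above, the detection precedence is: explicit user flag first, then decoupled model plus profile export, then backend type, with an ensemble containing a composing model named "tensorrt_llm" implying the TensorRT-LLM backend. A minimal, self-contained sketch of that heuristic with simplified stand-in types (not the real perf_analyzer interfaces):

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    enum class BackendType { TENSORRT_LLM, VLLM, OTHER };

    struct Model {
      bool decoupled = false;
      BackendType backend = BackendType::OTHER;
      // composing-model key -> (model name, model version), as in an ensemble
      std::map<std::string, std::pair<std::string, std::string>> composing_models;
    };

    bool IsLLM(const Model& m, bool user_flag, bool has_profile_export) {
      // 1) An explicit user flag always wins
      if (user_flag) {
        return true;
      }
      // 2) Decoupled (streaming) models with a profile export count as LLMs
      if (m.decoupled && has_profile_export) {
        return true;
      }
      // 3) A composing model named "tensorrt_llm" implies the TRT-LLM backend
      BackendType backend = m.backend;
      for (const auto& [key, name_version] : m.composing_models) {
        if (name_version.first == "tensorrt_llm") {
          backend = BackendType::TENSORRT_LLM;
          break;
        }
      }
      return backend == BackendType::VLLM || backend == BackendType::TENSORRT_LLM;
    }

    int main() {
      Model vllm_model;
      vllm_model.backend = BackendType::VLLM;
      std::cout << std::boolalpha << IsLLM(vllm_model, false, false) << "\n";

      Model ensemble;
      ensemble.composing_models["step"] = {"tensorrt_llm", "1"};
      std::cout << IsLLM(ensemble, false, false) << "\n";
    }

Compiled as-is, both checks in main() print "true".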