temp commit for feel for llm detection
jbkyang-nvi committed Feb 16, 2024
1 parent 1dc9836 commit 3401b87
Showing 5 changed files with 77 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/c++/perf_analyzer/command_line_parser.cc
@@ -188,6 +188,9 @@ CLParser::Usage(const std::string& msg)
std::cerr << "\t--collect-metrics" << std::endl;
std::cerr << "\t--metrics-url" << std::endl;
std::cerr << "\t--metrics-interval" << std::endl;
std::cerr << "\t--is-llm-model enforces llm style dummy inputs and generate "
"LLM specific metrics"
<< std::endl;
std::cerr << std::endl;
std::cerr << "==== OPTIONS ==== \n \n";

@@ -875,6 +878,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"periodic-concurrency-range", required_argument, 0, 59},
{"request-period", required_argument, 0, 60},
{"request-parameter", required_argument, 0, 61},
{"is-llm-model", no_argument, 0, 62},
{0, 0, 0, 0}};

// Parse commandline...
@@ -1608,6 +1612,10 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->request_parameters[name] = param;
break;
}
case 62: {
params_->is_llm = true;
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
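For background on how the {"is-llm-model", no_argument, 0, 62} entry above reaches the case 62 handler: getopt_long() returns an option's val field (62 here) whenever its flag pointer is 0. A minimal, self-contained sketch of that mechanism (a standalone illustration, not the parser's actual structure):

#include <getopt.h>

#include <cstdio>

int
main(int argc, char** argv)
{
  // A no-argument long option; getopt_long() returns its val (62) when seen.
  static struct option long_options[] = {
      {"is-llm-model", no_argument, 0, 62}, {0, 0, 0, 0}};
  int opt;
  while ((opt = getopt_long(argc, argv, "", long_options, nullptr)) != -1) {
    if (opt == 62) {
      std::printf("--is-llm-model was passed\n");  // handled by `case 62:`
    }
  }
  return 0;
}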
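If the flag lands as written above, an invocation would look something like this (model name illustrative; -m and --profile-export-file are existing perf_analyzer options):

perf_analyzer -m my_llm_model --is-llm-model --profile-export-file profile.json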
3 changes: 3 additions & 0 deletions src/c++/perf_analyzer/command_line_parser.h
@@ -125,6 +125,9 @@ struct PerfAnalyzerParameters {
uint64_t metrics_interval_ms{1000};
bool metrics_interval_ms_specified{false};

// Whether the model is an LLM. Determines if PA reports LLM-specific metrics
bool is_llm{false};

// Return true if targeting concurrency
//
bool targeting_concurrency() const
21 changes: 21 additions & 0 deletions src/c++/perf_analyzer/model_parser.cc
@@ -169,6 +169,16 @@ ModelParser::InitTriton(
response_cache_enabled_ = cache_itr->value["enable"].GetBool();
}

// Determine which backend serves the model:
const auto backend_config_itr = config.FindMember("backend");
if (backend_config_itr != config.MemberEnd()) {
std::string backend_str;
RETURN_IF_ERROR(GetString(backend_config_itr->value, &backend_str));
if (backend_str == "vllm") {
backend_type_ = TritonBackendType::VLLM;
}
}

return cb::Error::Success;
}
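For reference, the configuration walked above is the JSON form of the Triton model config; trimmed to the single member this hunk consults, it looks roughly like the following (name and value illustrative for a vLLM-backed model):

{
  "name": "my_model",
  "backend": "vllm",
  ...
}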

@@ -434,4 +444,15 @@ ModelParser::GetInt(const rapidjson::Value& value, int64_t* integer_value)
return cb::Error::Success;
}

cb::Error
ModelParser::GetString(const rapidjson::Value& value, std::string* string_value)
{
if (value.IsString()) {
// Copy into the caller's string; assigning the address of a local
// would leave string_value dangling.
string_value->assign(value.GetString(), value.GetStringLength());
return cb::Error::Success;
}
return cb::Error("Value is not a string", pa::GENERIC_ERROR);
}

}} // namespace triton::perfanalyzer
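A minimal standalone check of the corrected GetString helper, with cb::Error swapped for a plain bool so it compiles outside the repo (GetStringValue is a hypothetical analogue; the rapidjson calls are the real API):

#include <rapidjson/document.h>

#include <cassert>
#include <string>

// Standalone analogue of ModelParser::GetString.
static bool
GetStringValue(const rapidjson::Value& value, std::string* string_value)
{
  if (value.IsString()) {
    // Copy the characters out; the buffer remains owned by the DOM.
    string_value->assign(value.GetString(), value.GetStringLength());
    return true;
  }
  return false;
}

int
main()
{
  rapidjson::Document config;
  config.Parse(R"({"backend": "vllm"})");
  std::string backend_str;
  assert(GetStringValue(config["backend"], &backend_str));
  assert(backend_str == "vllm");
  assert(!GetStringValue(config, &backend_str));  // an object is not a string
  return 0;
}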
3 changes: 3 additions & 0 deletions src/c++/perf_analyzer/model_parser.h
@@ -67,6 +67,8 @@ class ModelParser {
ENSEMBLE_SEQUENCE
};

enum TritonBackendType { TENSORRT_LLM, VLLM, OTHER };

explicit ModelParser(cb::BackendKind backend_kind)
: backend_kind_(backend_kind),
inputs_(std::make_shared<ModelTensorMap>()),
@@ -214,6 +216,7 @@ class ModelParser {
std::string model_name_;
std::string model_version_;
std::string model_signature_name_;
TritonBackendType backend_type_ = TritonBackendType::OTHER;
size_t max_batch_size_;
bool response_cache_enabled_;

42 changes: 42 additions & 0 deletions src/c++/perf_analyzer/perf_analyzer.cc
@@ -52,6 +52,44 @@ SignalHandler(int signum)
exit(0);
}
}

bool
IsLLMModel(
    const std::shared_ptr<pa::ModelParser>& parser,
    const pa::PAParamsPtr& params)
{
  // The user can force LLM treatment explicitly via --is-llm-model.
  if (params->is_llm) {
    return true;
  }

  bool is_llm = false;
  // Heuristic 1: decoupled models that export a profile are treated as LLMs.
  is_llm =
      is_llm || (parser->IsDecoupled() && !params->profile_export_file.empty());

  // Heuristic 2: if this is an ensemble and any composing model is named
  // "tensorrt_llm", the model is certainly served by the TensorRT-LLM backend.
  if (!parser->composing_models_map_->empty()) {
    auto composing_models_map = parser->composing_models_map_;
    for (auto& [_, model_version_pair] : *composing_models_map) {
      std::string composing_model_name = model_version_pair.first;
      if (composing_model_name == "tensorrt_llm") {
        parser->backend_type_ = ModelParser::TritonBackendType::TENSORRT_LLM;
        break;
      }
    }
  }

  // Heuristic 3: the model is served by the vLLM or TensorRT-LLM backend.
  is_llm = is_llm ||
           (parser->backend_type_ == ModelParser::TritonBackendType::VLLM ||
            parser->backend_type_ ==
                ModelParser::TritonBackendType::TENSORRT_LLM);

  return is_llm;
}

}} // namespace triton::perfanalyzer

PerfAnalyzer::PerfAnalyzer(pa::PAParamsPtr params) : params_(params)
@@ -428,6 +466,10 @@ PerfAnalyzer::WriteReport()
bool should_output_metrics{
params_->should_collect_metrics && params_->verbose_csv};

// TODO (TMA-1526): Replace these heuristics with a reliable LLM-detection
// signal. Currently we check the user-supplied flag, decoupled-ness plus a
// profile export file, and whether the backend is vLLM or TensorRT-LLM.
bool should_output_llm_metrics{IsLLMModel(parser_, params_)};

std::unique_ptr<pa::ReportWriter> writer;

FAIL_IF_ERR(
