From c61d71a9e160df6e5fad67eb3c9e80c5f2537075 Mon Sep 17 00:00:00 2001
From: Hyunjae Woo
Date: Thu, 25 Jan 2024 13:56:41 -0800
Subject: [PATCH] Extract llm metric calculation into new function

---
 src/c++/perf_analyzer/report_writer.cc | 11 +++++++++--
 src/c++/perf_analyzer/report_writer.h  |  4 ++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/c++/perf_analyzer/report_writer.cc b/src/c++/perf_analyzer/report_writer.cc
index ebec24326..56a5fc140 100644
--- a/src/c++/perf_analyzer/report_writer.cc
+++ b/src/c++/perf_analyzer/report_writer.cc
@@ -409,6 +409,14 @@ ReportWriter::WriteGpuMetrics(std::ostream& ofs, const Metrics& metric)
 
 void
 ReportWriter::WriteLlmMetrics(std::ostream& ofs)
+{
+  auto [avg_first_token_latency, avg_t2t_latency] = CalculateLlmMetrics();
+  ofs << "," << avg_first_token_latency;
+  ofs << "," << avg_t2t_latency;
+}
+
+std::tuple<double, double>
+ReportWriter::CalculateLlmMetrics()
 {
   const std::vector<Experiment>& experiments{collector_->GetData()};
   std::vector<double> first_token_latencies;
@@ -437,8 +445,7 @@ ReportWriter::WriteLlmMetrics(std::ostream& ofs)
       std::reduce(t2t_latencies.begin(), t2t_latencies.end()) /
       t2t_latencies.size();
 
-  ofs << "," << avg_first_token_latency;
-  ofs << "," << avg_t2t_latency;
+  return std::make_tuple(avg_first_token_latency, avg_t2t_latency);
 }
 
 }}  // namespace triton::perfanalyzer
diff --git a/src/c++/perf_analyzer/report_writer.h b/src/c++/perf_analyzer/report_writer.h
index 2bed79f54..5e6fb2855 100644
--- a/src/c++/perf_analyzer/report_writer.h
+++ b/src/c++/perf_analyzer/report_writer.h
@@ -95,6 +95,10 @@ class ReportWriter {
       const std::shared_ptr<ProfileDataCollector>& collector,
       const bool should_output_llm_metrics);
 
+  /// Calculate LLM metrics (e.g., average first token latency) using the
+  /// profile data collected for decoupled model.
+  std::tuple<double, double> CalculateLlmMetrics();
+
   const std::string& filename_{""};
 
   const bool target_concurrency_{true};
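
For illustration, here is a minimal standalone sketch of the compute-then-write split this patch applies: the metric math returns a std::tuple<double, double> and the writer unpacks it with structured bindings before emitting CSV columns. The LatencySample struct, its field names, and the sample data below are assumptions made up for this sketch; they are not the perf_analyzer API.

    // Standalone sketch of the refactor pattern: calculation returns a tuple,
    // the writer consumes it via structured bindings and emits CSV columns.
    // LatencySample is a hypothetical stand-in for the collected profile data.
    #include <iostream>
    #include <numeric>
    #include <tuple>
    #include <vector>

    struct LatencySample {
      double first_token_latency;
      double token_to_token_latency;
    };

    // Compute the averages and hand them back, mirroring the role of
    // ReportWriter::CalculateLlmMetrics() in the patch.
    std::tuple<double, double>
    CalculateLlmMetrics(const std::vector<LatencySample>& samples)
    {
      std::vector<double> first_token_latencies;
      std::vector<double> t2t_latencies;
      for (const auto& s : samples) {
        first_token_latencies.push_back(s.first_token_latency);
        t2t_latencies.push_back(s.token_to_token_latency);
      }

      const double avg_first_token_latency =
          std::reduce(
              first_token_latencies.begin(), first_token_latencies.end()) /
          first_token_latencies.size();
      const double avg_t2t_latency =
          std::reduce(t2t_latencies.begin(), t2t_latencies.end()) /
          t2t_latencies.size();

      return std::make_tuple(avg_first_token_latency, avg_t2t_latency);
    }

    // Output-only counterpart, mirroring ReportWriter::WriteLlmMetrics().
    void
    WriteLlmMetrics(std::ostream& ofs, const std::vector<LatencySample>& samples)
    {
      auto [avg_first_token_latency, avg_t2t_latency] =
          CalculateLlmMetrics(samples);
      ofs << "," << avg_first_token_latency;
      ofs << "," << avg_t2t_latency;
    }

    int main()
    {
      std::vector<LatencySample> samples{{12.0, 3.0}, {10.0, 5.0}};
      WriteLlmMetrics(std::cout, samples);  // prints ",11,4"
      std::cout << "\n";
    }

Keeping the calculation free of any output stream, as the patch does, lets the averages be unit tested directly while the writer stays a thin formatting layer.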