Extract llm metric calculation into new function

triton-inference-server · Jan 25, 2024 · c61d71a · c61d71a
1 parent d86e830
commit c61d71a
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 2 deletions.
diff --git a/src/c++/perf_analyzer/report_writer.cc b/src/c++/perf_analyzer/report_writer.cc
@@ -409,6 +409,14 @@ ReportWriter::WriteGpuMetrics(std::ostream& ofs, const Metrics& metric)
 
 void
 ReportWriter::WriteLlmMetrics(std::ostream& ofs)
+{
+  auto [avg_first_token_latency, avg_t2t_latency] = CalculateLlmMetrics();
+  ofs << "," << avg_first_token_latency;
+  ofs << "," << avg_t2t_latency;
+}
+
+std::tuple<double, double>
+ReportWriter::CalculateLlmMetrics()
 {
   const std::vector<Experiment>& experiments{collector_->GetData()};
   std::vector<double> first_token_latencies;
@@ -437,8 +445,7 @@ ReportWriter::WriteLlmMetrics(std::ostream& ofs)
       std::reduce(t2t_latencies.begin(), t2t_latencies.end()) /
       t2t_latencies.size();
 
-  ofs << "," << avg_first_token_latency;
-  ofs << "," << avg_t2t_latency;
+  return std::make_tuple(avg_first_token_latency, avg_t2t_latency);
 }
 
 }}  // namespace triton::perfanalyzer
diff --git a/src/c++/perf_analyzer/report_writer.h b/src/c++/perf_analyzer/report_writer.h
@@ -95,6 +95,10 @@ class ReportWriter {
       const std::shared_ptr<ProfileDataCollector>& collector,
       const bool should_output_llm_metrics);
 
+  /// Calculate LLM metrics (e.g., average first token latency) using the
+  /// profile data collected for a decoupled model.
+  std::tuple<double, double> CalculateLlmMetrics();
+
 
   const std::string& filename_{""};
   const bool target_concurrency_{true};