Add a test case and handle multiple experiments
nv-hwoo committed Feb 2, 2024
1 parent 10b26a0 commit 2119a28
Showing 6 changed files with 112 additions and 57 deletions.
2 changes: 1 addition & 1 deletion src/c++/perf_analyzer/perf_utils.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
12 changes: 12 additions & 0 deletions src/c++/perf_analyzer/profile_data_collector.cc
@@ -82,4 +82,16 @@ ProfileDataCollector::AddData(
}
}

+std::optional<std::reference_wrapper<Experiment>>
+ProfileDataCollector::GetExperiment(InferenceLoadMode& id)
+{
+  auto it = FindExperiment(id);
+  if (it == experiments_.end()) {
+    std::cerr << "No experiment with concurrency: " << id.concurrency
+              << " and request rate: " << id.request_rate << " found."
+              << std::endl;
+    return std::nullopt;
+  }
+  return *it;
+}
}} // namespace triton::perfanalyzer
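As a quick orientation to the accessor added above: callers construct an InferenceLoadMode key, then check the optional before unwrapping the reference. The following is a minimal caller sketch, not part of this commit — the helper name PrintRequestCount and the load-mode values 4 / 0.0 are made up for illustration; only GetExperiment's optional-reference contract comes from the diff.

// Hypothetical caller sketch (assumes profile_data_collector.h is on the
// include path and InferenceLoadMode lives in triton::perfanalyzer).
#include <iostream>

#include "profile_data_collector.h"

namespace pa = triton::perfanalyzer;

void PrintRequestCount(pa::ProfileDataCollector& collector)
{
  pa::InferenceLoadMode id{4, 0.0};  // e.g. concurrency 4, no request rate

  auto experiment = collector.GetExperiment(id);
  if (!experiment.has_value()) {
    return;  // GetExperiment already reported the miss on stderr
  }

  // Unwrap the std::reference_wrapper before touching the experiment.
  pa::Experiment& exp = experiment->get();
  std::cout << exp.requests.size() << " requests recorded" << std::endl;
}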
9 changes: 8 additions & 1 deletion src/c++/perf_analyzer/profile_data_collector.h
@@ -27,6 +27,7 @@
#pragma once

#include <algorithm>
+#include <functional>
#include <map>
#include <tuple>

@@ -95,7 +96,13 @@ class ProfileDataCollector {
void AddData(
InferenceLoadMode& id, std::vector<RequestRecord>&& request_records);

-  /// Get the experiment data for the profile
+  /// Get the experiment data that corresponds to the given inference load mode
+  /// @param id Identifier for the experiment
+  /// @return The experiment data, or std::nullopt if not found
+  std::optional<std::reference_wrapper<Experiment>> GetExperiment(
+      InferenceLoadMode& id);
+
+  /// Get the entire experiment data for the profile
/// @return Experiment data
std::vector<Experiment>& GetData() { return experiments_; }

58 changes: 29 additions & 29 deletions src/c++/perf_analyzer/report_writer.cc
@@ -247,7 +247,7 @@ ReportWriter::GenerateReport()
}
}
if (should_output_llm_metrics_) {
-    WriteLLMMetrics(ofs);
+    WriteLLMMetrics(ofs, status);
}
ofs << std::endl;
}
@@ -403,49 +403,49 @@ ReportWriter::WriteGPUMetrics(std::ostream& ofs, const Metrics& metric)
}

void
-ReportWriter::WriteLLMMetrics(std::ostream& ofs)
+ReportWriter::WriteLLMMetrics(std::ostream& ofs, const PerfStatus& status)
{
-  auto [avg_first_token_latency, avg_t2t_latency] = CalculateLLMMetrics();
+  InferenceLoadMode id{status.concurrency, status.request_rate};
+  auto experiment = collector_->GetExperiment(id);
+
+  if (!experiment.has_value()) {
+    ofs << ",N/A,N/A";
+    return;
+  }
+
+  auto [avg_first_token_latency, avg_t2t_latency] =
+      CalculateLLMMetrics(*experiment);

if (avg_first_token_latency.has_value()) {
ofs << "," << avg_first_token_latency.value();
ofs << "," << *avg_first_token_latency;
} else {
ofs << ",n/a";
ofs << ",N/A";
}
if (avg_t2t_latency.has_value()) {
ofs << "," << avg_t2t_latency.value();
ofs << "," << *avg_t2t_latency;
} else {
ofs << ",n/a";
ofs << ",N/A";
}
}

std::tuple<std::optional<double>, std::optional<double>>
-ReportWriter::CalculateLLMMetrics()
+ReportWriter::CalculateLLMMetrics(const Experiment& experiment)
{
-  if (collector_->IsEmpty()) {
-    throw PerfAnalyzerException(
-        "Attempted to write LLM metrics when profile data is empty.",
-        GENERIC_ERROR);
-  }
-
-  const std::vector<Experiment>& experiments{collector_->GetData()};
std::vector<double> first_token_latencies;
std::vector<double> t2t_latencies;

-  for (const auto& exp : experiments) {
-    for (const auto& req : exp.requests) {
-      // Collect first token latencies
-      if (!req.response_times_.empty()) {
-        const std::chrono::duration<double, std::micro> ttft{
-            req.response_times_.front() - req.start_time_};
-        first_token_latencies.push_back(ttft.count());
-      }
-      // Collect token-to-token (T2T) latencies
-      for (size_t i = 1; i < req.response_times_.size(); i++) {
-        const std::chrono::duration<double, std::micro> t2t{
-            req.response_times_[i] - req.response_times_[i - 1]};
-        t2t_latencies.push_back(t2t.count());
-      }
+  for (const auto& req : experiment.requests) {
+    // Collect first token latencies
+    if (!req.response_times_.empty()) {
+      const std::chrono::duration<double, std::micro> ttft{
+          req.response_times_.front() - req.start_time_};
+      first_token_latencies.push_back(ttft.count());
+    }
+    // Collect token-to-token (T2T) latencies
+    for (size_t i = 1; i < req.response_times_.size(); i++) {
+      const std::chrono::duration<double, std::micro> t2t{
+          req.response_times_[i] - req.response_times_[i - 1]};
+      t2t_latencies.push_back(t2t.count());
+    }
}

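For reference, the per-request arithmetic that CalculateLLMMetrics applies above reduces to simple differences over each request's response timestamps. A rough standalone sketch follows — plain integer ticks stand in for the real RequestRecord fields, and the sample values are borrowed from the "many responses" test case further down.

// Standalone sketch of the TTFT / token-to-token averaging (illustrative only).
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  uint64_t request_time{1};                           // request sent at t=1
  std::vector<uint64_t> response_times{4, 5, 8, 10};  // streamed responses

  std::vector<double> first_token_latencies;
  std::vector<double> t2t_latencies;

  // Time to first token: first response minus request start.
  if (!response_times.empty()) {
    first_token_latencies.push_back(
        static_cast<double>(response_times.front() - request_time));
  }
  // Token-to-token latency: gap between consecutive responses.
  for (size_t i = 1; i < response_times.size(); i++) {
    t2t_latencies.push_back(
        static_cast<double>(response_times[i] - response_times[i - 1]));
  }

  auto avg = [](const std::vector<double>& v) {
    return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
  };
  // Prints "avg TTFT: 3, avg T2T: 2" for this single request.
  std::cout << "avg TTFT: " << avg(first_token_latencies)
            << ", avg T2T: " << avg(t2t_latencies) << std::endl;
}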
10 changes: 6 additions & 4 deletions src/c++/perf_analyzer/report_writer.h
@@ -83,7 +83,7 @@ class ReportWriter {

/// Output LLM metrics (e.g. average first token latency) to a stream.
/// \param ofs A stream to output the csv data
-  void WriteLLMMetrics(std::ostream& ofs);
+  void WriteLLMMetrics(std::ostream& ofs, const PerfStatus& status);

private:
ReportWriter(
@@ -96,9 +96,11 @@
const bool should_output_llm_metrics);

/// Calculate LLM metrics (e.g., average first token latency) using the
-  /// profile data collected for decoupled model.
-  std::tuple<std::optional<double>, std::optional<double>>
-  CalculateLLMMetrics();
+  /// profile data collected during a single inference experiment.
+  /// \param experiment The profile data containing the request and response
+  /// timestamps of a single inference experiment.
+  std::tuple<std::optional<double>, std::optional<double>> CalculateLLMMetrics(
+      const Experiment& experiment);


const std::string& filename_{""};
78 changes: 56 additions & 22 deletions src/c++/perf_analyzer/test_report_writer.cc
@@ -47,9 +47,9 @@ class TestReportWriter : ReportWriter {
ReportWriter::WriteGPUMetrics(ofs, metrics);
}

-  void WriteLLMMetrics(std::ostream& ofs)
+  void WriteLLMMetrics(std::ostream& ofs, PerfStatus& status)
{
-    ReportWriter::WriteLLMMetrics(ofs);
+    ReportWriter::WriteLLMMetrics(ofs, status);
}
};

@@ -132,7 +132,9 @@ TEST_CASE("report_writer: WriteLLMMetrics")
pa::ProfileDataCollector::Create(&collector),
"failed to create profile data collector");

-  InferenceLoadMode infer_mode{};
+  InferenceLoadMode infer_mode;
+  std::ostringstream actual_output;
+  std::string expected_output;

SUBCASE("requests with zero response")
{
@@ -151,13 +153,9 @@
std::vector<RequestRecord> request_records{rr1, rr2};
collector->AddData(infer_mode, std::move(request_records));

-    // Avg first token latency = n/a
-    // Avg token-to-token latency = n/a
-    TestReportWriter trw(collector);
-    std::ostringstream actual_output{};
-    trw.WriteLLMMetrics(actual_output);
-    const std::string expected_output{",n/a,n/a"};
-    CHECK(actual_output.str() == expected_output);
+    // Avg first token latency = N/A
+    // Avg token-to-token latency = N/A
+    expected_output = ",N/A,N/A";
}

SUBCASE("requests with single response")
@@ -181,15 +179,11 @@
// = ((response1[0] - request1) + (response2[0] - request2)) / 2
// = ((2 - 1) + (9 - 2)) / 2 = 4 us
//
-    // Avg token-to-token latency = n/a
-    TestReportWriter trw(collector);
-    std::ostringstream actual_output{};
-    trw.WriteLLMMetrics(actual_output);
-    const std::string expected_output{",4,n/a"};
-    CHECK(actual_output.str() == expected_output);
+    // Avg token-to-token latency = N/A
+    expected_output = ",4,N/A";
}

SUBCASE("requests with multiple responses")
SUBCASE("requests with many responses")
{
uint64_t sequence_id1{123};
uint64_t request_timestamp1{1};
@@ -213,12 +207,52 @@
// Avg token-to-token latency
// = ((res1[i] - res1[i - 1]) + ... + (res2[i] - res2[i - 1]) + ...) / 6
// = ((5-4) + (8-5) + (10-8) + (7-6) + (10-7) + (12-10)) / 6 = 2 us
-    TestReportWriter trw(collector);
-    std::ostringstream actual_output{};
-    trw.WriteLLMMetrics(actual_output);
-    const std::string expected_output{",3.5,2"};
-    CHECK(actual_output.str() == expected_output);
+    expected_output = ",3.5,2";
}

SUBCASE("requests with mixture of responses")
{
// zero response
uint64_t sequence_id1{123};
uint64_t request_timestamp1{1};
std::vector<uint64_t> response_timestamps1{};
RequestRecord rr1 = GenerateRequestRecord(
sequence_id1, request_timestamp1, response_timestamps1);

// single response
uint64_t sequence_id2{456};
uint64_t request_timestamp2{2};
std::vector<uint64_t> response_timestamps2{8};
RequestRecord rr2 = GenerateRequestRecord(
sequence_id2, request_timestamp2, response_timestamps2);

// many responses
uint64_t sequence_id3{456};
uint64_t request_timestamp3{4};
std::vector<uint64_t> response_timestamps3{6, 7, 10, 12};
RequestRecord rr3 = GenerateRequestRecord(
sequence_id3, request_timestamp3, response_timestamps3);

std::vector<RequestRecord> request_records{rr1, rr2, rr3};
collector->AddData(infer_mode, std::move(request_records));

// Avg first token latency
// = ((response2[0] - request2) + (response3[0] - request3)) / 2
// = ((8 - 2) + (6 - 4)) / 2 = 4 us
//
// Avg token-to-token latency
// = (... + (response3[i] - response3[i - 1]) + ...) / 3
// = ((7 - 6) + (10 - 7) + (12 - 10)) / 3 = 2 us
expected_output = ",4,2";
}

PerfStatus status;
status.concurrency = infer_mode.concurrency;
status.request_rate = infer_mode.request_rate;

TestReportWriter trw(collector);
trw.WriteLLMMetrics(actual_output, status);
CHECK(actual_output.str() == expected_output);
}

}} // namespace triton::perfanalyzer
