Handle zero and single response
nv-hwoo committed Jan 30, 2024
1 parent 5063521 commit 5b27a83
Showing 4 changed files with 78 additions and 18 deletions.
12 changes: 12 additions & 0 deletions src/c++/perf_analyzer/perf_utils.h
@@ -36,6 +36,7 @@
 #include <iomanip>
 #include <iostream>
 #include <memory>
+#include <optional>
 #include <random>
 
 #include "client_backend/client_backend.h"
@@ -83,6 +84,17 @@ class Range {
   T step;
 };
 
+template <typename T>
+std::optional<T>
+CalculateAverage(const std::vector<T>& data)
+{
+  if (data.empty()) {
+    return std::nullopt;
+  }
+  T sum = std::reduce(data.begin(), data.end());
+  return sum / data.size();
+}
+
 // Converts the datatype from tensorflow to perf analyzer space
 // \param tf_dtype The data type string returned from the model metadata.
 // \param datatype Returns the datatype in perf_analyzer space.
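For context, here is a minimal standalone sketch of how the new CalculateAverage helper behaves for empty and non-empty input. The main() driver and the sample values are illustrative only and not part of the commit; it assumes <numeric> is available for std::reduce.

// Illustrative sketch only -- mirrors the helper added to perf_utils.h.
#include <iostream>
#include <numeric>
#include <optional>
#include <vector>

template <typename T>
std::optional<T>
CalculateAverage(const std::vector<T>& data)
{
  if (data.empty()) {
    return std::nullopt;  // no data -> no average (reported as "n/a" downstream)
  }
  T sum = std::reduce(data.begin(), data.end());
  return sum / data.size();
}

int main()
{
  std::vector<double> none{};
  std::vector<double> latencies{1.0, 7.0};

  auto avg_none = CalculateAverage(none);       // std::nullopt
  auto avg_some = CalculateAverage(latencies);  // 4.0

  std::cout << (avg_none.has_value() ? "value" : "n/a") << "\n";  // prints n/a
  std::cout << avg_some.value() << "\n";                          // prints 4
}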
24 changes: 14 additions & 10 deletions src/c++/perf_analyzer/report_writer.cc
@@ -406,11 +406,20 @@ void
 ReportWriter::WriteLLMMetrics(std::ostream& ofs)
 {
   auto [avg_first_token_latency, avg_t2t_latency] = CalculateLLMMetrics();
-  ofs << "," << avg_first_token_latency;
-  ofs << "," << avg_t2t_latency;
+
+  if (avg_first_token_latency.has_value()) {
+    ofs << "," << avg_first_token_latency.value();
+  } else {
+    ofs << ",n/a";
+  }
+  if (avg_t2t_latency.has_value()) {
+    ofs << "," << avg_t2t_latency.value();
+  } else {
+    ofs << ",n/a";
+  }
 }
 
-std::tuple<double, double>
+std::tuple<std::optional<double>, std::optional<double>>
 ReportWriter::CalculateLLMMetrics()
 {
   if (collector_->IsEmpty()) {
@@ -440,13 +449,8 @@ ReportWriter::CalculateLLMMetrics()
     }
   }
 
-  auto avg_first_token_latency =
-      std::reduce(first_token_latencies.begin(), first_token_latencies.end()) /
-      first_token_latencies.size();
-  auto avg_t2t_latency =
-      std::reduce(t2t_latencies.begin(), t2t_latencies.end()) /
-      t2t_latencies.size();
-
+  auto avg_first_token_latency = CalculateAverage(first_token_latencies);
+  auto avg_t2t_latency = CalculateAverage(t2t_latencies);
   return std::make_tuple(avg_first_token_latency, avg_t2t_latency);
 }
 
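As an aside, the branching above can be read as "format a present average as its value and a missing one as n/a". A hypothetical helper (not part of perf_analyzer, for illustration only) expressing the same rule:

#include <optional>
#include <sstream>
#include <string>

// Hypothetical helper: one CSV cell per metric, prefixed with a comma
// exactly as WriteLLMMetrics emits it.
std::string
FormatMetricCell(const std::optional<double>& metric)
{
  std::ostringstream cell;
  if (metric.has_value()) {
    cell << "," << metric.value();
  } else {
    cell << ",n/a";
  }
  return cell.str();
}

// FormatMetricCell(4.0)          -> ",4"
// FormatMetricCell(std::nullopt) -> ",n/a"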
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/report_writer.h
@@ -97,7 +97,8 @@ class ReportWriter {
 
   /// Calculate LLM metrics (e.g., average first token latency) using the
   /// profile data collected for decoupled model.
-  std::tuple<double, double> CalculateLLMMetrics();
+  std::tuple<std::optional<double>, std::optional<double>>
+  CalculateLLMMetrics();
 
 
   const std::string& filename_{""};
57 changes: 50 additions & 7 deletions src/c++/perf_analyzer/test_report_writer.cc
@@ -132,18 +132,61 @@ TEST_CASE("report_writer: WriteLLMMetrics")
       pa::ProfileDataCollector::Create(&collector),
       "failed to create profile data collector");
 
-  InferenceLoadMode infer_mode{10, 20.0};  // dummy values
+  InferenceLoadMode infer_mode{};
 
   SUBCASE("request with zero response")
   {
-    // TODO
-    CHECK(false);
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records{rr1, rr2};
+    collector->AddData(infer_mode, std::move(request_records));
+
+    // Avg first token latency = n/a
+    // Avg token-to-token latency = n/a
+    TestReportWriter trw(collector);
+    std::ostringstream actual_output{};
+    trw.WriteLLMMetrics(actual_output);
+    const std::string expected_output{",n/a,n/a"};
+    CHECK(actual_output.str() == expected_output);
   }
 
-  SUBCASE("request with single response")
+  SUBCASE("requests with single response")
   {
-    // TODO
-    CHECK(false);
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{2};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{9};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records{rr1, rr2};
+    collector->AddData(infer_mode, std::move(request_records));
+
+    // Avg first token latency
+    // = ((response1[0] - request1) + (response2[0] - request2)) / 2
+    // = ((2 - 1) + (9 - 2)) / 2 = 4 us
+    //
+    // Avg token-to-token latency = n/a
+    TestReportWriter trw(collector);
+    std::ostringstream actual_output{};
+    trw.WriteLLMMetrics(actual_output);
+    const std::string expected_output{",4,n/a"};
+    CHECK(actual_output.str() == expected_output);
   }
 
   SUBCASE("requests with multiple responses")
@@ -164,7 +207,7 @@ TEST_CASE("report_writer: WriteLLMMetrics")
     collector->AddData(infer_mode, std::move(request_records));
 
     // Avg first token latency
-    // = ((response1[0] - request1) + (response2[0] - request2) + ...) / 3
+    // = ((response1[0] - request1) + (response2[0] - request2)) / 2
     // = ((4 - 1) + (6 - 2)) / 2 = 3.5 us
    //
     // Avg token-to-token latency
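Why the expected strings work out this way: first token latency is the gap between a request timestamp and its first response, so it is undefined when a request has zero responses, and token-to-token latency needs at least two responses per request. A hypothetical sketch of that per-request gap computation (an assumption about the metric's definition, not the actual perf_analyzer code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical sketch: token-to-token latencies are the gaps between
// consecutive response timestamps, so a request with zero or one response
// contributes nothing -- which is why those subcases expect ",n/a".
std::vector<uint64_t>
TokenToTokenGaps(const std::vector<uint64_t>& response_timestamps)
{
  std::vector<uint64_t> gaps;
  for (std::size_t i = 1; i < response_timestamps.size(); i++) {
    gaps.push_back(response_timestamps[i] - response_timestamps[i - 1]);
  }
  return gaps;  // empty when there are fewer than two responses
}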
