Handle zero and single response
nv-hwoo committed Jan 30, 2024
1 parent 5063521 commit 5b27a83
Showing 4 changed files with 78 additions and 18 deletions.
12 changes: 12 additions & 0 deletions src/c++/perf_analyzer/perf_utils.h
@@ -36,6 +36,7 @@
 #include <iomanip>
 #include <iostream>
 #include <memory>
+#include <optional>
 #include <random>
 
 #include "client_backend/client_backend.h"
@@ -83,6 +84,17 @@ class Range {
   T step;
 };
 
+template <typename T>
+std::optional<T>
+CalculateAverage(const std::vector<T>& data)
+{
+  if (data.empty()) {
+    return std::nullopt;
+  }
+  T sum = std::reduce(data.begin(), data.end());
+  return sum / data.size();
+}
+
 // Converts the datatype from tensorflow to perf analyzer space
 // \param tf_dtype The data type string returned from the model metadata.
 // \param datatype Returns the datatype in perf_analyzer space.
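For context, here is a minimal standalone sketch of how the new CalculateAverage helper behaves for empty and non-empty input. The main() driver and the sample values are illustrative only and not part of the commit; it assumes <numeric> is available for std::reduce.

// Illustrative sketch only -- mirrors the helper added to perf_utils.h.
#include <iostream>
#include <numeric>
#include <optional>
#include <vector>

template <typename T>
std::optional<T>
CalculateAverage(const std::vector<T>& data)
{
  if (data.empty()) {
    return std::nullopt;  // no data -> no average (reported as "n/a" downstream)
  }
  T sum = std::reduce(data.begin(), data.end());
  return sum / data.size();
}

int main()
{
  std::vector<double> none{};
  std::vector<double> latencies{1.0, 7.0};

  auto avg_none = CalculateAverage(none);       // std::nullopt
  auto avg_some = CalculateAverage(latencies);  // 4.0

  std::cout << (avg_none.has_value() ? "value" : "n/a") << "\n";  // prints n/a
  std::cout << avg_some.value() << "\n";                          // prints 4
}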
24 changes: 14 additions & 10 deletions src/c++/perf_analyzer/report_writer.cc
@@ -406,11 +406,20 @@ void
 ReportWriter::WriteLLMMetrics(std::ostream& ofs)
 {
   auto [avg_first_token_latency, avg_t2t_latency] = CalculateLLMMetrics();
-  ofs << "," << avg_first_token_latency;
-  ofs << "," << avg_t2t_latency;
+
+  if (avg_first_token_latency.has_value()) {
+    ofs << "," << avg_first_token_latency.value();
+  } else {
+    ofs << ",n/a";
+  }
+  if (avg_t2t_latency.has_value()) {
+    ofs << "," << avg_t2t_latency.value();
+  } else {
+    ofs << ",n/a";
+  }
 }
 
-std::tuple<double, double>
+std::tuple<std::optional<double>, std::optional<double>>
 ReportWriter::CalculateLLMMetrics()
 {
   if (collector_->IsEmpty()) {
@@ -440,13 +449,8 @@ ReportWriter::CalculateLLMMetrics()
     }
   }
 
-  auto avg_first_token_latency =
-      std::reduce(first_token_latencies.begin(), first_token_latencies.end()) /
-      first_token_latencies.size();
-  auto avg_t2t_latency =
-      std::reduce(t2t_latencies.begin(), t2t_latencies.end()) /
-      t2t_latencies.size();
-
+  auto avg_first_token_latency = CalculateAverage(first_token_latencies);
+  auto avg_t2t_latency = CalculateAverage(t2t_latencies);
   return std::make_tuple(avg_first_token_latency, avg_t2t_latency);
 }
 
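As an aside, the branching above can be read as "format a present average as its value and a missing one as n/a". A hypothetical helper (not part of perf_analyzer, for illustration only) expressing the same rule:

#include <optional>
#include <sstream>
#include <string>

// Hypothetical helper: one CSV cell per metric, prefixed with a comma
// exactly as WriteLLMMetrics emits it.
std::string
FormatMetricCell(const std::optional<double>& metric)
{
  std::ostringstream cell;
  if (metric.has_value()) {
    cell << "," << metric.value();
  } else {
    cell << ",n/a";
  }
  return cell.str();
}

// FormatMetricCell(4.0)          -> ",4"
// FormatMetricCell(std::nullopt) -> ",n/a"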
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/report_writer.h
@@ -97,7 +97,8 @@ class ReportWriter {
 
   /// Calculate LLM metrics (e.g., average first token latency) using the
   /// profile data collected for decoupled model.
-  std::tuple<double, double> CalculateLLMMetrics();
+  std::tuple<std::optional<double>, std::optional<double>>
+  CalculateLLMMetrics();
 
 
   const std::string& filename_{""};
57 changes: 50 additions & 7 deletions src/c++/perf_analyzer/test_report_writer.cc
@@ -132,18 +132,61 @@ TEST_CASE("report_writer: WriteLLMMetrics")
       pa::ProfileDataCollector::Create(&collector),
       "failed to create profile data collector");
 
-  InferenceLoadMode infer_mode{10, 20.0};  // dummy values
+  InferenceLoadMode infer_mode{};
 
   SUBCASE("request with zero response")
   {
-    // TODO
-    CHECK(false);
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records{rr1, rr2};
+    collector->AddData(infer_mode, std::move(request_records));
+
+    // Avg first token latency = n/a
+    // Avg token-to-token latency = n/a
+    TestReportWriter trw(collector);
+    std::ostringstream actual_output{};
+    trw.WriteLLMMetrics(actual_output);
+    const std::string expected_output{",n/a,n/a"};
+    CHECK(actual_output.str() == expected_output);
   }
 
-  SUBCASE("request with single response")
+  SUBCASE("requests with single response")
   {
-    // TODO
-    CHECK(false);
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{2};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{9};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records{rr1, rr2};
+    collector->AddData(infer_mode, std::move(request_records));
+
+    // Avg first token latency
+    // = ((response1[0] - request1) + (response2[0] - request2)) / 2
+    // = ((2 - 1) + (9 - 2)) / 2 = 4 us
+    //
+    // Avg token-to-token latency = n/a
+    TestReportWriter trw(collector);
+    std::ostringstream actual_output{};
+    trw.WriteLLMMetrics(actual_output);
+    const std::string expected_output{",4,n/a"};
+    CHECK(actual_output.str() == expected_output);
   }
 
   SUBCASE("requests with multiple responses")
@@ -164,7 +207,7 @@ TEST_CASE("report_writer: WriteLLMMetrics")
     collector->AddData(infer_mode, std::move(request_records));
 
     // Avg first token latency
-    // = ((response1[0] - request1) + (response2[0] - request2) + ...) / 3
+    // = ((response1[0] - request1) + (response2[0] - request2)) / 2
     // = ((4 - 1) + (6 - 2)) / 2 = 3.5 us
    //
     // Avg token-to-token latency
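Why the expected strings work out this way: first token latency is the gap between a request timestamp and its first response, so it is undefined when a request has zero responses, and token-to-token latency needs at least two responses per request. A hypothetical sketch of that per-request gap computation (an assumption about the metric's definition, not the actual perf_analyzer code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical sketch: token-to-token latencies are the gaps between
// consecutive response timestamps, so a request with zero or one response
// contributes nothing -- which is why those subcases expect ",n/a".
std::vector<uint64_t>
TokenToTokenGaps(const std::vector<uint64_t>& response_timestamps)
{
  std::vector<uint64_t> gaps;
  for (std::size_t i = 1; i < response_timestamps.size(); i++) {
    gaps.push_back(response_timestamps[i] - response_timestamps[i - 1]);
  }
  return gaps;  // empty when there are fewer than two responses
}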
