Add a test case and handle multiple experiments
nv-hwoo committed Feb 2, 2024
1 parent 10b26a0 commit 2119a28
Showing 6 changed files with 112 additions and 57 deletions.
2 changes: 1 addition & 1 deletion src/c++/perf_analyzer/perf_utils.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
12 changes: 12 additions & 0 deletions src/c++/perf_analyzer/profile_data_collector.cc
@@ -82,4 +82,16 @@ ProfileDataCollector::AddData(
}
}

+std::optional<std::reference_wrapper<Experiment>>
+ProfileDataCollector::GetExperiment(InferenceLoadMode& id)
+{
+  auto it = FindExperiment(id);
+  if (it == experiments_.end()) {
+    std::cerr << "No experiment with concurrency: " << id.concurrency
+              << " and request rate: " << id.request_rate << " found."
+              << std::endl;
+    return std::nullopt;
+  }
+  return *it;
+}
}} // namespace triton::perfanalyzer
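As a quick orientation to the accessor added above: callers construct an InferenceLoadMode key, then check the optional before unwrapping the reference. The following is a minimal caller sketch, not part of this commit — the helper name PrintRequestCount and the load-mode values 4 / 0.0 are made up for illustration; only GetExperiment's optional-reference contract comes from the diff.

// Hypothetical caller sketch (assumes profile_data_collector.h is on the
// include path and InferenceLoadMode lives in triton::perfanalyzer).
#include <iostream>

#include "profile_data_collector.h"

namespace pa = triton::perfanalyzer;

void PrintRequestCount(pa::ProfileDataCollector& collector)
{
  pa::InferenceLoadMode id{4, 0.0};  // e.g. concurrency 4, no request rate

  auto experiment = collector.GetExperiment(id);
  if (!experiment.has_value()) {
    return;  // GetExperiment already reported the miss on stderr
  }

  // Unwrap the std::reference_wrapper before touching the experiment.
  pa::Experiment& exp = experiment->get();
  std::cout << exp.requests.size() << " requests recorded" << std::endl;
}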
9 changes: 8 additions & 1 deletion src/c++/perf_analyzer/profile_data_collector.h
@@ -27,6 +27,7 @@
#pragma once

#include <algorithm>
+#include <functional>
#include <map>
#include <tuple>

@@ -95,7 +96,13 @@ class ProfileDataCollector {
void AddData(
InferenceLoadMode& id, std::vector<RequestRecord>&& request_records);

-  /// Get the experiment data for the profile
+  /// Get the experiment data that corresponds to the given inference load mode
+  /// @param id Identifier for the experiment
+  /// @return The experiment data, or std::nullopt if not found
+  std::optional<std::reference_wrapper<Experiment>> GetExperiment(
+      InferenceLoadMode& id);
+
+  /// Get the entire experiment data for the profile
/// @return Experiment data
std::vector<Experiment>& GetData() { return experiments_; }

58 changes: 29 additions & 29 deletions src/c++/perf_analyzer/report_writer.cc
@@ -247,7 +247,7 @@ ReportWriter::GenerateReport()
}
}
if (should_output_llm_metrics_) {
-    WriteLLMMetrics(ofs);
+    WriteLLMMetrics(ofs, status);
}
ofs << std::endl;
}
@@ -403,49 +403,49 @@ ReportWriter::WriteGPUMetrics(std::ostream& ofs, const Metrics& metric)
}

void
-ReportWriter::WriteLLMMetrics(std::ostream& ofs)
+ReportWriter::WriteLLMMetrics(std::ostream& ofs, const PerfStatus& status)
{
-  auto [avg_first_token_latency, avg_t2t_latency] = CalculateLLMMetrics();
+  InferenceLoadMode id{status.concurrency, status.request_rate};
+  auto experiment = collector_->GetExperiment(id);
+
+  if (!experiment.has_value()) {
+    ofs << ",N/A,N/A";
+    return;
+  }
+
+  auto [avg_first_token_latency, avg_t2t_latency] =
+      CalculateLLMMetrics(*experiment);

if (avg_first_token_latency.has_value()) {
ofs << "," << avg_first_token_latency.value();
ofs << "," << *avg_first_token_latency;
} else {
ofs << ",n/a";
ofs << ",N/A";
}
if (avg_t2t_latency.has_value()) {
ofs << "," << avg_t2t_latency.value();
ofs << "," << *avg_t2t_latency;
} else {
ofs << ",n/a";
ofs << ",N/A";
}
}

std::tuple<std::optional<double>, std::optional<double>>
-ReportWriter::CalculateLLMMetrics()
+ReportWriter::CalculateLLMMetrics(const Experiment& experiment)
{
-  if (collector_->IsEmpty()) {
-    throw PerfAnalyzerException(
-        "Attempted to write LLM metrics when profile data is empty.",
-        GENERIC_ERROR);
-  }
-
-  const std::vector<Experiment>& experiments{collector_->GetData()};
std::vector<double> first_token_latencies;
std::vector<double> t2t_latencies;

-  for (const auto& exp : experiments) {
-    for (const auto& req : exp.requests) {
-      // Collect first token latencies
-      if (!req.response_times_.empty()) {
-        const std::chrono::duration<double, std::micro> ttft{
-            req.response_times_.front() - req.start_time_};
-        first_token_latencies.push_back(ttft.count());
-      }
-      // Collect token-to-token (T2T) latencies
-      for (size_t i = 1; i < req.response_times_.size(); i++) {
-        const std::chrono::duration<double, std::micro> t2t{
-            req.response_times_[i] - req.response_times_[i - 1]};
-        t2t_latencies.push_back(t2t.count());
-      }
+  for (const auto& req : experiment.requests) {
+    // Collect first token latencies
+    if (!req.response_times_.empty()) {
+      const std::chrono::duration<double, std::micro> ttft{
+          req.response_times_.front() - req.start_time_};
+      first_token_latencies.push_back(ttft.count());
+    }
+    // Collect token-to-token (T2T) latencies
+    for (size_t i = 1; i < req.response_times_.size(); i++) {
+      const std::chrono::duration<double, std::micro> t2t{
+          req.response_times_[i] - req.response_times_[i - 1]};
+      t2t_latencies.push_back(t2t.count());
+    }
}

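For reference, the per-request arithmetic that CalculateLLMMetrics applies above reduces to simple differences over each request's response timestamps. A rough standalone sketch follows — plain integer ticks stand in for the real RequestRecord fields, and the sample values are borrowed from the "many responses" test case further down.

// Standalone sketch of the TTFT / token-to-token averaging (illustrative only).
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  uint64_t request_time{1};                           // request sent at t=1
  std::vector<uint64_t> response_times{4, 5, 8, 10};  // streamed responses

  std::vector<double> first_token_latencies;
  std::vector<double> t2t_latencies;

  // Time to first token: first response minus request start.
  if (!response_times.empty()) {
    first_token_latencies.push_back(
        static_cast<double>(response_times.front() - request_time));
  }
  // Token-to-token latency: gap between consecutive responses.
  for (size_t i = 1; i < response_times.size(); i++) {
    t2t_latencies.push_back(
        static_cast<double>(response_times[i] - response_times[i - 1]));
  }

  auto avg = [](const std::vector<double>& v) {
    return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
  };
  // Prints "avg TTFT: 3, avg T2T: 2" for this single request.
  std::cout << "avg TTFT: " << avg(first_token_latencies)
            << ", avg T2T: " << avg(t2t_latencies) << std::endl;
}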
10 changes: 6 additions & 4 deletions src/c++/perf_analyzer/report_writer.h
@@ -83,7 +83,7 @@ class ReportWriter {

/// Output LLM metrics (e.g. average first token latency) to a stream.
/// \param ofs A stream to output the csv data
-  void WriteLLMMetrics(std::ostream& ofs);
+  void WriteLLMMetrics(std::ostream& ofs, const PerfStatus& status);

private:
ReportWriter(
@@ -96,9 +96,11 @@
const bool should_output_llm_metrics);

/// Calculate LLM metrics (e.g., average first token latency) using the
-  /// profile data collected for decoupled model.
-  std::tuple<std::optional<double>, std::optional<double>>
-  CalculateLLMMetrics();
+  /// profile data collected during a single inference experiment.
+  /// \param experiment The profile data containing the request and response
+  /// timestamps of a single inference experiment.
+  std::tuple<std::optional<double>, std::optional<double>> CalculateLLMMetrics(
+      const Experiment& experiment);


const std::string& filename_{""};
78 changes: 56 additions & 22 deletions src/c++/perf_analyzer/test_report_writer.cc
@@ -47,9 +47,9 @@ class TestReportWriter : ReportWriter {
ReportWriter::WriteGPUMetrics(ofs, metrics);
}

-  void WriteLLMMetrics(std::ostream& ofs)
+  void WriteLLMMetrics(std::ostream& ofs, PerfStatus& status)
{
-    ReportWriter::WriteLLMMetrics(ofs);
+    ReportWriter::WriteLLMMetrics(ofs, status);
}
};

@@ -132,7 +132,9 @@ TEST_CASE("report_writer: WriteLLMMetrics")
pa::ProfileDataCollector::Create(&collector),
"failed to create profile data collector");

-  InferenceLoadMode infer_mode{};
+  InferenceLoadMode infer_mode;
+  std::ostringstream actual_output;
+  std::string expected_output;

SUBCASE("requests with zero response")
{
@@ -151,13 +153,9 @@
std::vector<RequestRecord> request_records{rr1, rr2};
collector->AddData(infer_mode, std::move(request_records));

-    // Avg first token latency = n/a
-    // Avg token-to-token latency = n/a
-    TestReportWriter trw(collector);
-    std::ostringstream actual_output{};
-    trw.WriteLLMMetrics(actual_output);
-    const std::string expected_output{",n/a,n/a"};
-    CHECK(actual_output.str() == expected_output);
+    // Avg first token latency = N/A
+    // Avg token-to-token latency = N/A
+    expected_output = ",N/A,N/A";
}

SUBCASE("requests with single response")
@@ -181,15 +179,11 @@
// = ((response1[0] - request1) + (response2[0] - request2)) / 2
// = ((2 - 1) + (9 - 2)) / 2 = 4 us
//
-    // Avg token-to-token latency = n/a
-    TestReportWriter trw(collector);
-    std::ostringstream actual_output{};
-    trw.WriteLLMMetrics(actual_output);
-    const std::string expected_output{",4,n/a"};
-    CHECK(actual_output.str() == expected_output);
+    // Avg token-to-token latency = N/A
+    expected_output = ",4,N/A";
}

SUBCASE("requests with multiple responses")
SUBCASE("requests with many responses")
{
uint64_t sequence_id1{123};
uint64_t request_timestamp1{1};
@@ -213,12 +207,52 @@
// Avg token-to-token latency
// = ((res1[i] - res1[i - 1]) + ... + (res2[i] - res2[i - 1]) + ...) / 6
// = ((5-4) + (8-5) + (10-8) + (7-6) + (10-7) + (12-10)) / 6 = 2 us
-    TestReportWriter trw(collector);
-    std::ostringstream actual_output{};
-    trw.WriteLLMMetrics(actual_output);
-    const std::string expected_output{",3.5,2"};
-    CHECK(actual_output.str() == expected_output);
+    expected_output = ",3.5,2";
}

SUBCASE("requests with mixture of responses")
{
// zero response
uint64_t sequence_id1{123};
uint64_t request_timestamp1{1};
std::vector<uint64_t> response_timestamps1{};
RequestRecord rr1 = GenerateRequestRecord(
sequence_id1, request_timestamp1, response_timestamps1);

// single response
uint64_t sequence_id2{456};
uint64_t request_timestamp2{2};
std::vector<uint64_t> response_timestamps2{8};
RequestRecord rr2 = GenerateRequestRecord(
sequence_id2, request_timestamp2, response_timestamps2);

// many responses
uint64_t sequence_id3{456};
uint64_t request_timestamp3{4};
std::vector<uint64_t> response_timestamps3{6, 7, 10, 12};
RequestRecord rr3 = GenerateRequestRecord(
sequence_id3, request_timestamp3, response_timestamps3);

std::vector<RequestRecord> request_records{rr1, rr2, rr3};
collector->AddData(infer_mode, std::move(request_records));

// Avg first token latency
// = ((response2[0] - request2) + (response3[0] - request3)) / 2
// = ((8 - 2) + (6 - 4)) / 2 = 4 us
//
// Avg token-to-token latency
// = (... + (response3[i] - response3[i - 1]) + ...) / 3
// = ((7 - 6) + (10 - 7) + (12 - 10)) / 3 = 2 us
expected_output = ",4,2";
}

PerfStatus status;
status.concurrency = infer_mode.concurrency;
status.request_rate = infer_mode.request_rate;

TestReportWriter trw(collector);
trw.WriteLLMMetrics(actual_output, status);
CHECK(actual_output.str() == expected_output);
}

}} // namespace triton::perfanalyzer
