diff --git a/src/c++/perf_analyzer/CMakeLists.txt b/src/c++/perf_analyzer/CMakeLists.txt
index bebdba4d5..dd0a6d53e 100644
--- a/src/c++/perf_analyzer/CMakeLists.txt
+++ b/src/c++/perf_analyzer/CMakeLists.txt
@@ -204,6 +204,7 @@ add_executable(
     mock_sequence_manager.h
     mock_profile_data_collector.h
     mock_profile_data_exporter.h
+    mock_report_writer.h
     test_dataloader.cc
     test_inference_profiler.cc
     test_command_line_parser.cc
diff --git a/src/c++/perf_analyzer/report_writer.h b/src/c++/perf_analyzer/report_writer.h
index 4d4c80f6d..c08130641 100644
--- a/src/c++/perf_analyzer/report_writer.h
+++ b/src/c++/perf_analyzer/report_writer.h
@@ -36,7 +36,7 @@
 namespace triton { namespace perfanalyzer {
 
 #ifndef DOCTEST_CONFIG_DISABLE
-class TestReportWriter;
+class NaggyMockReportWriter;
 #endif
 
 //==============================================================================
@@ -116,7 +116,7 @@ class ReportWriter {
   const bool should_output_llm_metrics_{false};
 
 #ifndef DOCTEST_CONFIG_DISABLE
-  friend TestReportWriter;
+  friend NaggyMockReportWriter;
 
  public:
   ReportWriter() = default;
diff --git a/src/c++/perf_analyzer/test_report_writer.cc b/src/c++/perf_analyzer/test_report_writer.cc
index a7ecaf7ef..00bcc28f0 100644
--- a/src/c++/perf_analyzer/test_report_writer.cc
+++ b/src/c++/perf_analyzer/test_report_writer.cc
@@ -27,35 +27,17 @@
 #include <sstream>
 
 #include "doctest.h"
+#include "mock_model_parser.h"
+#include "mock_report_writer.h"
 #include "profile_data_collector.h"
 #include "report_writer.h"
 #include "request_record.h"
 
 namespace triton { namespace perfanalyzer {
 
-class TestReportWriter : ReportWriter {
- public:
-  TestReportWriter() = default;
-  TestReportWriter(const std::shared_ptr<ProfileDataCollector>& collector)
-      : ReportWriter(
-            "", false, std::vector<pa::PerfStatus>{}, false, false, 0, nullptr,
-            false, collector, true)
-  {
-  }
-  void WriteGPUMetrics(std::ostream& ofs, const Metrics& metrics)
-  {
-    ReportWriter::WriteGPUMetrics(ofs, metrics);
-  }
-
-  void WriteLLMMetrics(std::ostream& ofs, PerfStatus& status)
-  {
-    ReportWriter::WriteLLMMetrics(ofs, status);
-  }
-};
-
-TEST_CASE("testing WriteGPUMetrics")
+TEST_CASE("report_writer: WriteGPUMetrics")
 {
-  TestReportWriter trw{};
+  MockReportWriter mrw{};
   Metrics m{};
   m.gpu_utilization_per_gpu["a"] = 1.0;
   m.gpu_power_usage_per_gpu["a"] = 2.2;
@@ -65,7 +47,7 @@ TEST_CASE("testing WriteGPUMetrics")
 
   SUBCASE("single gpu complete output")
   {
-    trw.WriteGPUMetrics(actual_output, m);
+    mrw.WriteGPUMetrics(actual_output, m);
     const std::string expected_output{",a:1;,a:2.2;,a:3;,a:4;"};
     CHECK(actual_output.str() == expected_output);
   }
@@ -73,7 +55,7 @@ TEST_CASE("testing WriteGPUMetrics")
   SUBCASE("single gpu missing data")
   {
     m.gpu_power_usage_per_gpu.erase("a");
-    trw.WriteGPUMetrics(actual_output, m);
+    mrw.WriteGPUMetrics(actual_output, m);
     const std::string expected_output{",a:1;,,a:3;,a:4;"};
     CHECK(actual_output.str() == expected_output);
   }
@@ -87,7 +69,7 @@
 
   SUBCASE("multi gpu complete output")
   {
-    trw.WriteGPUMetrics(actual_output, m);
+    mrw.WriteGPUMetrics(actual_output, m);
     const std::string expected_output{
         ",a:1;z:100;,a:2.2;z:222.2;,a:3;z:45;,a:4;z:89;"};
     CHECK(actual_output.str() == expected_output);
@@ -97,7 +79,7 @@
   {
     m.gpu_utilization_per_gpu.erase("z");
     m.gpu_power_usage_per_gpu.erase("a");
-    trw.WriteGPUMetrics(actual_output, m);
+    mrw.WriteGPUMetrics(actual_output, m);
     const std::string expected_output{",a:1;,z:222.2;,a:3;z:45;,a:4;z:89;"};
     CHECK(actual_output.str() == expected_output);
   }
@@ -250,9 +232,157 @@ TEST_CASE("report_writer: WriteLLMMetrics")
   status.concurrency = infer_mode.concurrency;
   status.request_rate = infer_mode.request_rate;
 
-  TestReportWriter trw(collector);
-  trw.WriteLLMMetrics(actual_output, status);
+  MockReportWriter mrw;
+  mrw.collector_ = collector;
+  mrw.WriteLLMMetrics(actual_output, status);
   CHECK(actual_output.str() == expected_output);
 }
 
+TEST_CASE("report_writer: GenerateReport")
+{
+  std::string filename{"temp.csv"};
+  std::vector<PerfStatus> summary;
+  std::shared_ptr<ModelParser> mmp{nullptr};
+  std::shared_ptr<ProfileDataCollector> collector{nullptr};
+  CHECK_NOTHROW_MESSAGE(
+      pa::ProfileDataCollector::Create(&collector),
+      "failed to create profile data collector");
+
+  // default parameters
+  bool target_concurrency{true};
+  bool verbose_csv{false};
+  bool include_server_stats{false};
+  int32_t percentile{90};
+  bool should_output_metrics{false};
+  bool should_output_llm_metrics{false};
+  bool is_sequence_model{false};
+  bool is_decoupled_model{false};
+
+  std::ostringstream actual_output;
+  std::string expected_output;
+
+  SUBCASE("single experiment")
+  {
+    mmp = std::make_shared<MockModelParser>(
+        is_sequence_model, is_decoupled_model);
+
+    ClientSideStats css;
+    css.infer_per_sec = 150.123;
+    css.avg_send_time_ns = 2000;
+    css.avg_receive_time_ns = 3000;
+
+    PerfStatus ps;
+    ps.concurrency = 10;
+    ps.client_stats = css;
+
+    summary.push_back(ps);
+
+    expected_output =
+        "Concurrency,Inferences/Second,Client Send,Client "
+        "Recv\n"
+        "10,150.123,2,3\n";
+  }
+
+  SUBCASE("multiple LLM experiments")
+  {
+    // set parameters
+    is_decoupled_model = true;
+    should_output_llm_metrics = true;
+
+    mmp = std::make_shared<MockModelParser>(
+        is_sequence_model, is_decoupled_model);
+
+    // first experiment
+    ClientSideStats css1;
+    css1.infer_per_sec = 150;
+    css1.responses_per_sec = 123.456;
+    css1.avg_send_time_ns = 2000;
+    css1.avg_receive_time_ns = 3000;
+
+    PerfStatus ps1;
+    ps1.concurrency = 10;
+    ps1.client_stats = css1;
+
+    summary.push_back(ps1);
+
+    InferenceLoadMode infer_mode1{ps1.concurrency, ps1.request_rate};
+    uint64_t sequence_id1{123};
+    uint64_t request_timestamp1{1};
+    std::vector<uint64_t> response_timestamps1{4, 5, 8, 10};
+    RequestRecord rr1 = GenerateRequestRecord(
+        sequence_id1, request_timestamp1, response_timestamps1);
+
+    uint64_t sequence_id2{456};
+    uint64_t request_timestamp2{2};
+    std::vector<uint64_t> response_timestamps2{6, 7, 10, 12};
+    RequestRecord rr2 = GenerateRequestRecord(
+        sequence_id2, request_timestamp2, response_timestamps2);
+
+    std::vector<RequestRecord> request_records1{rr1, rr2};
+    collector->AddData(infer_mode1, std::move(request_records1));
+
+    // second experiment
+    ClientSideStats css2;
+    css2.infer_per_sec = 345.12;
+    css2.responses_per_sec = 10.789;
+    css2.avg_send_time_ns = 4000;
+    css2.avg_receive_time_ns = 5000;
+
+    PerfStatus ps2;
+    ps2.concurrency = 30;
+    ps2.client_stats = css2;
+
+    summary.push_back(ps2);
+
+    InferenceLoadMode infer_mode2{ps2.concurrency, ps2.request_rate};
+    uint64_t sequence_id3{123};
+    uint64_t request_timestamp3{1};
+    std::vector<uint64_t> response_timestamps3{5, 8, 9, 11};
+    RequestRecord rr3 = GenerateRequestRecord(
+        sequence_id3, request_timestamp3, response_timestamps3);
+
+    uint64_t sequence_id4{456};
+    uint64_t request_timestamp4{2};
+    std::vector<uint64_t> response_timestamps4{10, 15, 19, 22};
+    RequestRecord rr4 = GenerateRequestRecord(
+        sequence_id4, request_timestamp4, response_timestamps4);
+
+    std::vector<RequestRecord> request_records2{rr3, rr4};
+    collector->AddData(infer_mode2, std::move(request_records2));
+
+    expected_output =
+        "Concurrency,Inferences/Second,Response Throughput,Client Send,Client "
"Recv,Avg First Token Latency,Avg Token-to-Token Latency\n" + "10,150,123.456,2,3,3.5,2\n" + "30,345.12,10.789,4,5,6,3\n"; + } + + MockReportWriter mrw{ + filename, + summary, + mmp, + collector, + target_concurrency, + verbose_csv, + include_server_stats, + percentile, + should_output_metrics, + should_output_llm_metrics, + }; + mrw.GenerateReport(); + + // read from temp.csv + std::ifstream input_file(filename); + std::string line; + while (std::getline(input_file, line)) { + actual_output << line << "\n"; + } + input_file.close(); + + CHECK(actual_output.str() == expected_output); + + // clean up + std::remove(filename.c_str()); +} + }} // namespace triton::perfanalyzer