From b913d58f015fd79f6779d4d38d021960fc98b778 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Fri, 8 Mar 2024 16:36:42 -0800 Subject: [PATCH] Fix unit tests --- .../genai-pa/tests/test_llm_metrics.py | 87 ++++++++++++------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py b/src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py index bb2fd1008..0530b1471 100755 --- a/src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py +++ b/src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py @@ -51,19 +51,22 @@ def prepare_triton_profile_data(self) -> None: { "timestamp": 1, "response_timestamps": [3, 5, 8], + # LLMProfileDataParser preprocesses the responses + # from triton server and removes first few chars. + # Add whitespace to avoid valid chars being removed. "response_outputs": [ - {"text_output": "dogs"}, - {"text_output": "are"}, - {"text_output": "cool"}, + {"text_output": " dogs"}, + {"text_output": " are"}, + {"text_output": " cool"}, ], }, { "timestamp": 2, "response_timestamps": [4, 7, 11], "response_outputs": [ - {"text_output": "I"}, - {"text_output": "don't"}, - {"text_output": "cook food"}, + {"text_output": " I"}, + {"text_output": " don't"}, + {"text_output": " cook food"}, ], }, ], @@ -78,19 +81,19 @@ def prepare_triton_profile_data(self) -> None: "timestamp": 5, "response_timestamps": [7, 8, 13, 18], "response_outputs": [ - {"text_output": "cats"}, - {"text_output": "are"}, - {"text_output": "cool"}, - {"text_output": "too"}, + {"text_output": " cats"}, + {"text_output": " are"}, + {"text_output": " cool"}, + {"text_output": " too"}, ], }, { "timestamp": 3, "response_timestamps": [6, 8, 11], "response_outputs": [ - {"text_output": "it's"}, - {"text_output": "very"}, - {"text_output": "simple work"}, + {"text_output": " it's"}, + {"text_output": " very"}, + {"text_output": " simple work"}, ], }, ], @@ -199,64 +202,76 @@ def test_triton_llm_profile_data(self, 
prepare_triton_profile_data) -> None: ) # experiment 1 statistics - stat = pd.get_statistics(infer_mode="concurrency", load_level=10) + stat = pd.get_statistics(infer_mode="concurrency", load_level="10") assert stat.avg_time_to_first_token == 2 assert stat.avg_inter_token_latency == 2.25 - assert stat.avg_output_token_throughput == pytest.approx(31 / 63) + avg_ott = (31 / 63) * 1e9 # ns to sec + assert stat.avg_output_token_throughput == pytest.approx(avg_ott) assert stat.avg_num_output_token == 4 assert stat.p50_time_to_first_token == 2 assert stat.p50_inter_token_latency == 2 - assert stat.p50_output_token_throughput == pytest.approx(31 / 63) + p50_ott = (31 / 63) * 1e9 # ns to sec + assert stat.p50_output_token_throughput == pytest.approx(p50_ott) assert stat.p50_num_output_token == 4 assert stat.min_time_to_first_token == 2 assert stat.min_inter_token_latency == 2 - assert stat.min_output_token_throughput == pytest.approx(3 / 7) + min_ott = (3 / 7) * 1e9 # ns to sec + assert stat.min_output_token_throughput == pytest.approx(min_ott) assert stat.min_num_output_token == 3 assert stat.max_time_to_first_token == 2 assert stat.max_inter_token_latency == 3 - assert stat.max_output_token_throughput == pytest.approx(5 / 9) + max_ott = (5 / 9) * 1e9 # ns to sec + assert stat.max_output_token_throughput == pytest.approx(max_ott) assert stat.max_num_output_token == 5 assert stat.std_time_to_first_token == np.std([2, 2]) assert stat.std_inter_token_latency == np.std([2, 3, 2, 2]) - assert stat.std_output_token_throughput == np.std([3 / 7, 5 / 9]) + ott1 = (3 / 7) * 1e9 + ott2 = (5 / 9) * 1e9 + assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2])) assert stat.std_num_output_token == np.std([3, 5]) # experiment 2 statistics - stat = pd.get_statistics(infer_mode="request_rate", load_level=2.0) + stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0") assert stat.avg_time_to_first_token == 2.5 assert stat.avg_inter_token_latency == 3 - 
assert stat.avg_output_token_throughput == pytest.approx(97 / 208) + avg_ott = (97 / 208) * 1e9 # ns to sec + assert stat.avg_output_token_throughput == pytest.approx(avg_ott) assert stat.avg_num_output_token == 4.5 assert stat.p50_time_to_first_token == 2.5 assert stat.p50_inter_token_latency == 2 - assert stat.p50_output_token_throughput == pytest.approx(97 / 208) + p50_ott = (97 / 208) * 1e9 # ns to sec + assert stat.p50_output_token_throughput == pytest.approx(p50_ott) assert stat.p50_num_output_token == 4.5 assert stat.min_time_to_first_token == 2 assert stat.min_inter_token_latency == 1 - assert stat.min_output_token_throughput == pytest.approx(4 / 13) + min_ott = (4 / 13) * 1e9 # ns to sec + assert stat.min_output_token_throughput == pytest.approx(min_ott) assert stat.min_num_output_token == 4 assert stat.max_time_to_first_token == 3 assert stat.max_inter_token_latency == 5 - assert stat.max_output_token_throughput == pytest.approx(5 / 8) + max_ott = (5 / 8) * 1e9 # ns to sec + assert stat.max_output_token_throughput == pytest.approx(max_ott) assert stat.max_num_output_token == 5 assert stat.std_time_to_first_token == np.std([2, 3]) assert stat.std_inter_token_latency == np.std([1, 5, 5, 2, 2]) - assert stat.std_output_token_throughput == np.std([4 / 13, 5 / 8]) + ott1 = (4 / 13) * 1e9 + ott2 = (5 / 8) * 1e9 + assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2])) assert stat.std_num_output_token == np.std([4, 5]) # check non-existing profile data with pytest.raises(KeyError): - pd.get_statistics(infer_mode="concurrency", load_level=30) + pd.get_statistics(infer_mode="concurrency", load_level="30") def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None: """Collect LLM metrics from profile export data and check values. 
@@ -280,36 +295,42 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None: ) # experiment 1 statistics - stat = pd.get_statistics(infer_mode="concurrency", load_level=10) + stat = pd.get_statistics(infer_mode="concurrency", load_level="10") assert stat.avg_time_to_first_token == 2 assert stat.avg_inter_token_latency == 2.4 - assert stat.avg_output_token_throughput == pytest.approx(37 / 91) + avg_ott = (37 / 91) * 1e9 # ns to sec + assert stat.avg_output_token_throughput == pytest.approx(avg_ott) assert stat.avg_num_output_token == 4 assert stat.p50_time_to_first_token == 2 assert stat.p50_inter_token_latency == 2 - assert stat.p50_output_token_throughput == pytest.approx(37 / 91) + p50_ott = (37 / 91) * 1e9 # ns to sec + assert stat.p50_output_token_throughput == pytest.approx(p50_ott) assert stat.p50_num_output_token == 4 assert stat.min_time_to_first_token == 2 assert stat.min_inter_token_latency == 2 - assert stat.min_output_token_throughput == pytest.approx(5 / 13) + min_ott = (5 / 13) * 1e9 + assert stat.min_output_token_throughput == pytest.approx(min_ott) assert stat.min_num_output_token == 3 assert stat.max_time_to_first_token == 2 assert stat.max_inter_token_latency == 3 - assert stat.max_output_token_throughput == pytest.approx(3 / 7) + max_ott = (3 / 7) * 1e9 + assert stat.max_output_token_throughput == pytest.approx(max_ott) assert stat.max_num_output_token == 5 assert stat.std_time_to_first_token == np.std([2, 2]) assert stat.std_inter_token_latency == np.std([2, 3, 3, 2, 2]) - assert stat.std_output_token_throughput == np.std([3 / 7, 5 / 13]) + ott1 = (3 / 7) * 1e9 + ott2 = (5 / 13) * 1e9 + assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2])) assert stat.std_num_output_token == np.std([3, 5]) # check non-existing profile data with pytest.raises(KeyError): - pd.get_statistics(infer_mode="concurrency", load_level=40) + pd.get_statistics(infer_mode="concurrency", load_level="40") def 
test_llm_metrics_get_base_name(self) -> None: """Test get_base_name method in LLMMetrics class."""