Skip to content

Commit

Permalink
Fix failing test_llm_metrics.py (#509)
Browse files Browse the repository at this point in the history
* Fix unit tests

* Add FIXME
  • Loading branch information
nv-hwoo authored and mc-nv committed Mar 13, 2024
1 parent fae32bf commit 36b62a8
Showing 1 changed file with 50 additions and 28 deletions.
78 changes: 50 additions & 28 deletions src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,23 @@ def prepare_triton_profile_data(self) -> None:
{
"timestamp": 1,
"response_timestamps": [3, 5, 8],
# FIXME - remove the whitespace once PA handles it.
# LLMProfileDataParser preprocesses the responses
# from the Triton server and removes the first few chars.
# Add whitespace to avoid valid chars being removed.
"response_outputs": [
{"text_output": "dogs"},
{"text_output": "are"},
{"text_output": "cool"},
{"text_output": " dogs"},
{"text_output": " are"},
{"text_output": " cool"},
],
},
{
"timestamp": 2,
"response_timestamps": [4, 7, 11],
"response_outputs": [
{"text_output": "I"},
{"text_output": "don't"},
{"text_output": "cook food"},
{"text_output": " I"},
{"text_output": " don't"},
{"text_output": " cook food"},
],
},
],
Expand All @@ -78,19 +82,19 @@ def prepare_triton_profile_data(self) -> None:
"timestamp": 5,
"response_timestamps": [7, 8, 13, 18],
"response_outputs": [
{"text_output": "cats"},
{"text_output": "are"},
{"text_output": "cool"},
{"text_output": "too"},
{"text_output": " cats"},
{"text_output": " are"},
{"text_output": " cool"},
{"text_output": " too"},
],
},
{
"timestamp": 3,
"response_timestamps": [6, 8, 11],
"response_outputs": [
{"text_output": "it's"},
{"text_output": "very"},
{"text_output": "simple work"},
{"text_output": " it's"},
{"text_output": " very"},
{"text_output": " simple work"},
],
},
],
Expand Down Expand Up @@ -203,55 +207,67 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:

assert stat.avg_time_to_first_token == 2
assert stat.avg_inter_token_latency == 2.25
assert stat.avg_output_token_throughput == pytest.approx(31 / 63)
avg_ott = (31 / 63) * 1e9 # ns to sec
assert stat.avg_output_token_throughput == pytest.approx(avg_ott)
assert stat.avg_num_output_token == 4

assert stat.p50_time_to_first_token == 2
assert stat.p50_inter_token_latency == 2
assert stat.p50_output_token_throughput == pytest.approx(31 / 63)
p50_ott = (31 / 63) * 1e9 # ns to sec
assert stat.p50_output_token_throughput == pytest.approx(p50_ott)
assert stat.p50_num_output_token == 4

assert stat.min_time_to_first_token == 2
assert stat.min_inter_token_latency == 2
assert stat.min_output_token_throughput == pytest.approx(3 / 7)
min_ott = (3 / 7) * 1e9 # ns to sec
assert stat.min_output_token_throughput == pytest.approx(min_ott)
assert stat.min_num_output_token == 3

assert stat.max_time_to_first_token == 2
assert stat.max_inter_token_latency == 3
assert stat.max_output_token_throughput == pytest.approx(5 / 9)
max_ott = (5 / 9) * 1e9 # ns to sec
assert stat.max_output_token_throughput == pytest.approx(max_ott)
assert stat.max_num_output_token == 5

assert stat.std_time_to_first_token == np.std([2, 2])
assert stat.std_inter_token_latency == np.std([2, 3, 2, 2])
assert stat.std_output_token_throughput == np.std([3 / 7, 5 / 9])
ott1 = (3 / 7) * 1e9
ott2 = (5 / 9) * 1e9
assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2]))
assert stat.std_num_output_token == np.std([3, 5])

# experiment 2 statistics
stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0")

assert stat.avg_time_to_first_token == 2.5
assert stat.avg_inter_token_latency == 3
assert stat.avg_output_token_throughput == pytest.approx(97 / 208)
avg_ott = (97 / 208) * 1e9 # ns to sec
assert stat.avg_output_token_throughput == pytest.approx(avg_ott)
assert stat.avg_num_output_token == 4.5

assert stat.p50_time_to_first_token == 2.5
assert stat.p50_inter_token_latency == 2
assert stat.p50_output_token_throughput == pytest.approx(97 / 208)
p50_ott = (97 / 208) * 1e9 # ns to sec
assert stat.p50_output_token_throughput == pytest.approx(p50_ott)
assert stat.p50_num_output_token == 4.5

assert stat.min_time_to_first_token == 2
assert stat.min_inter_token_latency == 1
assert stat.min_output_token_throughput == pytest.approx(4 / 13)
min_ott = (4 / 13) * 1e9 # ns to sec
assert stat.min_output_token_throughput == pytest.approx(min_ott)
assert stat.min_num_output_token == 4

assert stat.max_time_to_first_token == 3
assert stat.max_inter_token_latency == 5
assert stat.max_output_token_throughput == pytest.approx(5 / 8)
max_ott = (5 / 8) * 1e9 # ns to sec
assert stat.max_output_token_throughput == pytest.approx(max_ott)
assert stat.max_num_output_token == 5

assert stat.std_time_to_first_token == np.std([2, 3])
assert stat.std_inter_token_latency == np.std([1, 5, 5, 2, 2])
assert stat.std_output_token_throughput == np.std([4 / 13, 5 / 8])
ott1 = (4 / 13) * 1e9
ott2 = (5 / 8) * 1e9
assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2]))
assert stat.std_num_output_token == np.std([4, 5])

# check non-existing profile data
Expand Down Expand Up @@ -284,27 +300,33 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:

assert stat.avg_time_to_first_token == 2
assert stat.avg_inter_token_latency == 2.4
assert stat.avg_output_token_throughput == pytest.approx(37 / 91)
avg_ott = (37 / 91) * 1e9 # ns to sec
assert stat.avg_output_token_throughput == pytest.approx(avg_ott)
assert stat.avg_num_output_token == 4

assert stat.p50_time_to_first_token == 2
assert stat.p50_inter_token_latency == 2
assert stat.p50_output_token_throughput == pytest.approx(37 / 91)
p50_ott = (37 / 91) * 1e9 # ns to sec
assert stat.p50_output_token_throughput == pytest.approx(p50_ott)
assert stat.p50_num_output_token == 4

assert stat.min_time_to_first_token == 2
assert stat.min_inter_token_latency == 2
assert stat.min_output_token_throughput == pytest.approx(5 / 13)
min_ott = (5 / 13) * 1e9
assert stat.min_output_token_throughput == pytest.approx(min_ott)
assert stat.min_num_output_token == 3

assert stat.max_time_to_first_token == 2
assert stat.max_inter_token_latency == 3
assert stat.max_output_token_throughput == pytest.approx(3 / 7)
max_ott = (3 / 7) * 1e9
assert stat.max_output_token_throughput == pytest.approx(max_ott)
assert stat.max_num_output_token == 5

assert stat.std_time_to_first_token == np.std([2, 2])
assert stat.std_inter_token_latency == np.std([2, 3, 3, 2, 2])
assert stat.std_output_token_throughput == np.std([3 / 7, 5 / 13])
ott1 = (3 / 7) * 1e9
ott2 = (5 / 13) * 1e9
assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2]))
assert stat.std_num_output_token == np.std([3, 5])

# check non-existing profile data
Expand Down

0 comments on commit 36b62a8

Please sign in to comment.