Fix unit tests
nv-hwoo committed Mar 9, 2024
1 parent cced423 commit b913d58
Showing 1 changed file with 54 additions and 33 deletions.
87 changes: 54 additions & 33 deletions src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py
@@ -51,19 +51,22 @@ def prepare_triton_profile_data(self) -> None:
 {
     "timestamp": 1,
     "response_timestamps": [3, 5, 8],
+    # LLMProfileDataParser preprocesses the responses
+    # from triton server and removes first few chars.
+    # Add whitespace to avoid valid chars being removed.
     "response_outputs": [
-        {"text_output": "dogs"},
-        {"text_output": "are"},
-        {"text_output": "cool"},
+        {"text_output": " dogs"},
+        {"text_output": " are"},
+        {"text_output": " cool"},
     ],
 },
 {
     "timestamp": 2,
     "response_timestamps": [4, 7, 11],
     "response_outputs": [
-        {"text_output": "I"},
-        {"text_output": "don't"},
-        {"text_output": "cook food"},
+        {"text_output": " I"},
+        {"text_output": " don't"},
+        {"text_output": " cook food"},
     ],
 },
 ],
@@ -78,19 +81,19 @@ def prepare_triton_profile_data(self) -> None:
     "timestamp": 5,
     "response_timestamps": [7, 8, 13, 18],
     "response_outputs": [
-        {"text_output": "cats"},
-        {"text_output": "are"},
-        {"text_output": "cool"},
-        {"text_output": "too"},
+        {"text_output": " cats"},
+        {"text_output": " are"},
+        {"text_output": " cool"},
+        {"text_output": " too"},
     ],
 },
 {
     "timestamp": 3,
     "response_timestamps": [6, 8, 11],
     "response_outputs": [
-        {"text_output": "it's"},
-        {"text_output": "very"},
-        {"text_output": "simple work"},
+        {"text_output": " it's"},
+        {"text_output": " very"},
+        {"text_output": " simple work"},
     ],
 },
 ],
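
Why the added leading whitespace: the comment introduced in the first hunk says LLMProfileDataParser strips the first few characters of each Triton response during preprocessing. A minimal sketch of that interaction, with strip_prefix as a hypothetical stand-in for the parser's preprocessing step (not its actual code):

    def strip_prefix(text_output: str, n: int = 1) -> str:
        # Assumed behavior: drop a fixed-width prefix from each response.
        return text_output[n:]

    assert strip_prefix(" dogs") == "dogs"  # padded token survives intact
    assert strip_prefix("dogs") == "ogs"    # unpadded token would lose a character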
@@ -199,64 +202,76 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
 )

 # experiment 1 statistics
-stat = pd.get_statistics(infer_mode="concurrency", load_level=10)
+stat = pd.get_statistics(infer_mode="concurrency", load_level="10")

 assert stat.avg_time_to_first_token == 2
 assert stat.avg_inter_token_latency == 2.25
-assert stat.avg_output_token_throughput == pytest.approx(31 / 63)
+avg_ott = (31 / 63) * 1e9  # ns to sec
+assert stat.avg_output_token_throughput == pytest.approx(avg_ott)
 assert stat.avg_num_output_token == 4

 assert stat.p50_time_to_first_token == 2
 assert stat.p50_inter_token_latency == 2
-assert stat.p50_output_token_throughput == pytest.approx(31 / 63)
+p50_ott = (31 / 63) * 1e9  # ns to sec
+assert stat.p50_output_token_throughput == pytest.approx(p50_ott)
 assert stat.p50_num_output_token == 4

 assert stat.min_time_to_first_token == 2
 assert stat.min_inter_token_latency == 2
-assert stat.min_output_token_throughput == pytest.approx(3 / 7)
+min_ott = (3 / 7) * 1e9  # ns to sec
+assert stat.min_output_token_throughput == pytest.approx(min_ott)
 assert stat.min_num_output_token == 3

 assert stat.max_time_to_first_token == 2
 assert stat.max_inter_token_latency == 3
-assert stat.max_output_token_throughput == pytest.approx(5 / 9)
+max_ott = (5 / 9) * 1e9  # ns to sec
+assert stat.max_output_token_throughput == pytest.approx(max_ott)
 assert stat.max_num_output_token == 5

 assert stat.std_time_to_first_token == np.std([2, 2])
 assert stat.std_inter_token_latency == np.std([2, 3, 2, 2])
-assert stat.std_output_token_throughput == np.std([3 / 7, 5 / 9])
+ott1 = (3 / 7) * 1e9
+ott2 = (5 / 9) * 1e9
+assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2]))
 assert stat.std_num_output_token == np.std([3, 5])

 # experiment 2 statistics
-stat = pd.get_statistics(infer_mode="request_rate", load_level=2.0)
+stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0")

 assert stat.avg_time_to_first_token == 2.5
 assert stat.avg_inter_token_latency == 3
-assert stat.avg_output_token_throughput == pytest.approx(97 / 208)
+avg_ott = (97 / 208) * 1e9  # ns to sec
+assert stat.avg_output_token_throughput == pytest.approx(avg_ott)
 assert stat.avg_num_output_token == 4.5

 assert stat.p50_time_to_first_token == 2.5
 assert stat.p50_inter_token_latency == 2
-assert stat.p50_output_token_throughput == pytest.approx(97 / 208)
+p50_ott = (97 / 208) * 1e9  # ns to sec
+assert stat.p50_output_token_throughput == pytest.approx(p50_ott)
 assert stat.p50_num_output_token == 4.5

 assert stat.min_time_to_first_token == 2
 assert stat.min_inter_token_latency == 1
-assert stat.min_output_token_throughput == pytest.approx(4 / 13)
+min_ott = (4 / 13) * 1e9  # ns to sec
+assert stat.min_output_token_throughput == pytest.approx(min_ott)
 assert stat.min_num_output_token == 4

 assert stat.max_time_to_first_token == 3
 assert stat.max_inter_token_latency == 5
-assert stat.max_output_token_throughput == pytest.approx(5 / 8)
+max_ott = (5 / 8) * 1e9  # ns to sec
+assert stat.max_output_token_throughput == pytest.approx(max_ott)
 assert stat.max_num_output_token == 5

 assert stat.std_time_to_first_token == np.std([2, 3])
 assert stat.std_inter_token_latency == np.std([1, 5, 5, 2, 2])
-assert stat.std_output_token_throughput == np.std([4 / 13, 5 / 8])
+ott1 = (4 / 13) * 1e9
+ott2 = (5 / 8) * 1e9
+assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2]))
 assert stat.std_num_output_token == np.std([4, 5])

 # check non-existing profile data
 with pytest.raises(KeyError):
-    pd.get_statistics(infer_mode="concurrency", load_level=30)
+    pd.get_statistics(infer_mode="concurrency", load_level="30")

 def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
     """Collect LLM metrics from profile export data and check values.
@@ -280,36 +295,42 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
 )

 # experiment 1 statistics
-stat = pd.get_statistics(infer_mode="concurrency", load_level=10)
+stat = pd.get_statistics(infer_mode="concurrency", load_level="10")

 assert stat.avg_time_to_first_token == 2
 assert stat.avg_inter_token_latency == 2.4
-assert stat.avg_output_token_throughput == pytest.approx(37 / 91)
+avg_ott = (37 / 91) * 1e9  # ns to sec
+assert stat.avg_output_token_throughput == pytest.approx(avg_ott)
 assert stat.avg_num_output_token == 4

 assert stat.p50_time_to_first_token == 2
 assert stat.p50_inter_token_latency == 2
-assert stat.p50_output_token_throughput == pytest.approx(37 / 91)
+p50_ott = (37 / 91) * 1e9  # ns to sec
+assert stat.p50_output_token_throughput == pytest.approx(p50_ott)
 assert stat.p50_num_output_token == 4

 assert stat.min_time_to_first_token == 2
 assert stat.min_inter_token_latency == 2
-assert stat.min_output_token_throughput == pytest.approx(5 / 13)
+min_ott = (5 / 13) * 1e9
+assert stat.min_output_token_throughput == pytest.approx(min_ott)
 assert stat.min_num_output_token == 3

 assert stat.max_time_to_first_token == 2
 assert stat.max_inter_token_latency == 3
-assert stat.max_output_token_throughput == pytest.approx(3 / 7)
+max_ott = (3 / 7) * 1e9
+assert stat.max_output_token_throughput == pytest.approx(max_ott)
 assert stat.max_num_output_token == 5

 assert stat.std_time_to_first_token == np.std([2, 2])
 assert stat.std_inter_token_latency == np.std([2, 3, 3, 2, 2])
-assert stat.std_output_token_throughput == np.std([3 / 7, 5 / 13])
+ott1 = (3 / 7) * 1e9
+ott2 = (5 / 13) * 1e9
+assert stat.std_output_token_throughput == pytest.approx(np.std([ott1, ott2]))
 assert stat.std_num_output_token == np.std([3, 5])

 # check non-existing profile data
 with pytest.raises(KeyError):
-    pd.get_statistics(infer_mode="concurrency", load_level=40)
+    pd.get_statistics(infer_mode="concurrency", load_level="40")

 def test_llm_metrics_get_base_name(self) -> None:
     """Test get_base_name method in LLMMetrics class."""

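The other recurring change passes load_level as a string ("10", "2.0", "30") rather than a number. A plausible reading, stated here as an assumption rather than the library's documented contract, is that get_statistics looks experiments up by string-valued load level, so an unknown level raises the KeyError the tests now expect:

    import pytest

    # Hypothetical lookup table keyed the way the updated calls suggest.
    experiments = {
        ("concurrency", "10"): "stats_1",
        ("request_rate", "2.0"): "stats_2",
    }
    assert experiments[("concurrency", "10")] == "stats_1"  # known load level
    with pytest.raises(KeyError):
        experiments[("concurrency", "30")]  # unknown load level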