Skip to content

Commit

Permalink
[CI/Benchmark] add more iteration and use median for robust latency benchmark (vllm-project#3889)
Browse files Browse the repository at this point in the history
  • Loading branch information
youkaichao authored Apr 6, 2024
1 parent 54951ac commit e4be7d7
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
return latency

print("Warming up...")
- run_to_completion(profile_dir=None)
+ for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+     run_to_completion(profile_dir=None)

if args.profile:
profile_dir = args.profile_result_dir
Expand All @@ -84,7 +85,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
+ latencies = np.array(latencies)
+ percentages = [10, 25, 50, 75, 90]
+ percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
+ for percentage, percentile in zip(percentages, percentiles):
+     print(f'{percentage}% percentile latency: {percentile} seconds')


if __name__ == '__main__':
Expand All @@ -106,9 +112,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
+ parser.add_argument('--num-iters-warmup',
+ type=int,
+ default=10,
+ help='Number of iterations to run for warmup.')
parser.add_argument('--num-iters',
type=int,
- default=3,
+ default=30,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
Expand Down

0 comments on commit e4be7d7

Please sign in to comment.