Add Llama3 configuration

mlcommons · Nov 22, 2024 · e44c62a · e44c62a
1 parent beaa790
commit e44c62a
Showing 1 changed file with 10 additions and 2 deletions.
diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf
@@ -14,6 +14,7 @@ dlrm-v2.*.performance_sample_count_override = 204800
 rnnt.*.performance_sample_count_override = 2513
 gptj.*.performance_sample_count_override = 13368
 llama2-70b.*.performance_sample_count_override = 24576
+llama3-405b.*.performance_sample_count_override = 8312
 stable-diffusion-xl.*.performance_sample_count_override = 5000
 # set to 0 to let the entire sample set be the performance sample set
 3d-unet.*.performance_sample_count_override = 0
@@ -44,6 +45,7 @@ retinanet.MultiStream.target_latency = 528
 gptj.*.sample_concatenate_permutation = 1
 llama2-70b.*.sample_concatenate_permutation = 1
 mixtral-8x7b.*.sample_concatenate_permutation = 1
+llama3-405b.*.sample_concatenate_permutation = 1
 
 *.Server.target_latency = 10
 *.Server.target_latency_percentile = 99
@@ -57,13 +59,14 @@ dlrm-v2.Server.target_latency = 60
 rnnt.Server.target_latency = 1000
 gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000
-# Llama2-70b benchmarks measures token latencies
+# Benchmarks that measure token latencies
 llama2-70b.*.use_token_latencies = 1
 mixtral-8x7b.*.use_token_latencies = 1
+llama3-405b.*.use_token_latencies = 1
 # gptj benchmark infers token latencies
 gptj.*.infer_token_latencies = 1
 gptj.*.token_latency_scaling_factor = 69
-# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0
+# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7b & llama3-405b benchmarks, therefore target_latency = 0
 llama2-70b.Server.target_latency = 0
 llama2-70b.Server.ttft_latency = 2000
 llama2-70b.Server.tpot_latency = 200
@@ -72,6 +75,10 @@ mixtral-8x7b.Server.target_latency = 0
 mixtral-8x7b.Server.ttft_latency = 2000
 mixtral-8x7b.Server.tpot_latency = 200
 
+llama3-405b.Server.target_latency = 0
+llama3-405b.Server.ttft_latency = 6000
+llama3-405b.Server.tpot_latency = 175
+
 *.Offline.target_latency_percentile = 90
 *.Offline.min_duration = 600000
 
@@ -89,6 +96,7 @@ rnnt.Offline.min_query_count = 2513
 3d-unet.Offline.min_query_count = 43
 stable-diffusion-xl.Offline.min_query_count = 5000
 llama2-70b.Offline.min_query_count = 24576
+llama3-405b.Offline.min_query_count = 8312
 mixtral-8x7b.Offline.min_query_count = 15000
 
 # These fields should be defined and overridden by user.conf.