diff --git a/examples/llama2/README.md b/examples/llama2/README.md index ae28ff8..5ac5a01 100644 --- a/examples/llama2/README.md +++ b/examples/llama2/README.md @@ -11,52 +11,28 @@ The Appendix of [Why GPT-3.5 is (mostly) cheaper than Llama 2](https://www.cursor.so/blog/llama-inference) reports measurements with `Llama-2-70B` on 2 A100-80GB GPUs. The tables below show llm-analysis results using the experiment setups with different efficiency numbers. The cost per GPU hour uses `$2.21`. For other details, check `run_infer_cursor.py`. -- `flops_efficiency=0.6 and hbm_memory_efficiency=0.6` - -| Batch Size | Prompt Tokens | Completion Tokens | Time to first token (s) | Time for completion (s) | Tokens/second | Price/1k prompt tokens | Price /1k Completion tokens | -| ---------- | ------------- | ----------------- | ------------------------ | ----------------------- | ------------- | ---------------------- | --------------------------- | -| 1 | 128 | 242 | 0.055 | 13.185 | 18.277 | 0.000532 | 0.06689 | -| 2 | 128 | 512 | 0.080 | 28.025 | 36.434 | 0.000386 | 0.03360 | -| 4 | 128 | 512 | 0.158 | 28.209 | 72.197 | 0.000378 | 0.01691 | -| 1 | 512 | 512 | 0.159 | 27.986 | 18.192 | 0.000381 | 0.06711 | -| 2 | 512 | 512 | 0.318 | 28.130 | 35.995 | 0.000381 | 0.03373 | -| 4 | 512 | 512 | 0.635 | 28.420 | 70.487 | 0.000381 | 0.01704 | -| 1 | 512 | 304 | 0.159 | 16.600 | 18.140 | 0.000381 | 0.06704 | -| 8 | 512 | 512 | 1.270 | 28.999 | 135.323 | 0.000381 | 0.00869 | -| 16 | 512 | 512 | 2.539 | 30.156 | 250.556 | 0.000381 | 0.00452 | -| 32 | 512 | 512 | 5.078 | 32.472 | 436.335 | 0.000380 | 0.00243 | -| 1 | 1024 | 512 | 0.321 | 28.056 | 18.043 | 0.000385 | 0.06728 | -| 2 | 1024 | 512 | 0.642 | 28.271 | 35.416 | 0.000385 | 0.03390 | -| 4 | 1024 | 512 | 1.284 | 28.701 | 68.301 | 0.000385 | 0.01721 | -| 8 | 1024 | 512 | 2.568 | 29.560 | 127.488 | 0.000385 | 0.00886 | -| 16 | 1024 | 512 | 5.136 | 31.280 | 224.957 | 0.000385 | 0.00469 | -| 1 | 3595 | 512 | 1.192 | 28.408 | 17.297 | 0.000407 | 0.06812 | -| 2 | 3595 | 512 | 2.384 | 28.976 | 32.653 | 0.000407 | 0.03474 | -| 4 | 3595 | 512 | 4.767 | 30.111 | 58.719 | 0.000407 | 0.01805 | - - `flops_efficiency=0.7 and hbm_memory_efficiency=0.9` | Batch Size | Prompt Tokens | Completion Tokens | Time to first token (s) | Time for completion (s) | Tokens/second | Price/1k prompt tokens | Price /1k Completion tokens | -| ---------- | ------------- | ----------------- | ------------------------ | ----------------------- | ------------- | ---------------------- | --------------------------- | -| 1 | 128 | 242 | 0.039 | 8.894 | 27.090 | 0.000374 | 0.04512 | -| 2 | 128 | 512 | 0.068 | 18.903 | 53.976 | 0.000326 | 0.02267 | -| 4 | 128 | 512 | 0.136 | 19.026 | 106.878 | 0.000325 | 0.01141 | -| 1 | 512 | 512 | 0.137 | 18.877 | 26.928 | 0.000328 | 0.04527 | -| 2 | 512 | 512 | 0.273 | 18.974 | 53.204 | 0.000327 | 0.02275 | -| 4 | 512 | 512 | 0.546 | 19.167 | 103.892 | 0.000327 | 0.01149 | -| 1 | 512 | 304 | 0.137 | 11.197 | 26.823 | 0.000328 | 0.04522 | -| 8 | 512 | 512 | 1.092 | 19.553 | 198.402 | 0.000327 | 0.00586 | -| 16 | 512 | 512 | 2.183 | 20.326 | 363.938 | 0.000327 | 0.00305 | -| 32 | 512 | 512 | 4.366 | 21.872 | 624.440 | 0.000327 | 0.00164 | -| 1 | 1024 | 512 | 0.276 | 18.924 | 26.666 | 0.000331 | 0.04538 | -| 2 | 1024 | 512 | 0.552 | 19.067 | 52.193 | 0.000331 | 0.02286 | -| 4 | 1024 | 512 | 1.104 | 19.354 | 100.106 | 0.000331 | 0.01160 | -| 8 | 1024 | 512 | 2.208 | 19.928 | 185.037 | 0.000331 | 0.00597 | -| 16 | 1024 | 512 | 4.416 | 21.075 | 321.363 | 0.000331 | 0.00316 | -| 1 | 3595 | 512 | 1.025 | 19.159 | 25.367 | 0.000350 | 0.04594 | -| 2 | 3595 | 512 | 2.049 | 19.537 | 47.437 | 0.000350 | 0.02343 | -| 4 | 3595 | 512 | 4.098 | 20.294 | 83.961 | 0.000350 | 0.01217 | - +| ---------- | ------------- | ----------------- | ----------------------- | ----------------------- | ------------- | ---------------------- | --------------------------- | +| 1 | 128 | 242 | 0.040 | 9.425 | 25.568 | 0.000383 | 0.0478 | +| 2 | 128 | 512 | 0.062 | 20.026 | 50.977 | 0.000296 | 0.0240 | +| 4 | 128 | 512 | 0.123 | 20.148 | 101.031 | 0.000295 | 0.0121 | +| 1 | 512 | 512 | 0.124 | 20.000 | 25.442 | 0.000298 | 0.0480 | +| 2 | 512 | 512 | 0.248 | 20.096 | 50.334 | 0.000297 | 0.0241 | +| 4 | 512 | 512 | 0.496 | 20.288 | 98.537 | 0.000297 | 0.0122 | +| 1 | 512 | 304 | 0.124 | 11.864 | 25.359 | 0.000298 | 0.0479 | +| 8 | 512 | 512 | 0.991 | 20.673 | 189.069 | 0.000297 | 0.0062 | +| 16 | 512 | 512 | 1.982 | 21.442 | 349.726 | 0.000297 | 0.0032 | +| 32 | 512 | 512 | 3.963 | 22.981 | 608.077 | 0.000297 | 0.0017 | +| 1 | 1024 | 512 | 0.251 | 20.047 | 25.224 | 0.000301 | 0.0481 | +| 2 | 1024 | 512 | 0.502 | 20.190 | 49.488 | 0.000301 | 0.0242 | +| 4 | 1024 | 512 | 1.004 | 20.476 | 95.348 | 0.000301 | 0.0123 | +| 8 | 1024 | 512 | 2.007 | 21.048 | 177.666 | 0.000301 | 0.0063 | +| 16 | 1024 | 512 | 4.014 | 22.191 | 312.615 | 0.000301 | 0.0033 | +| 1 | 3595 | 512 | 0.936 | 20.282 | 24.130 | 0.000320 | 0.0486 | +| 2 | 3595 | 512 | 1.872 | 20.660 | 45.446 | 0.000320 | 0.0248 | +| 4 | 3595 | 512 | 3.745 | 21.416 | 81.398 | 0.000320 | 0.0128 | ## References diff --git a/tests/test_inference.py b/tests/test_inference.py index ef14fbc..e3445d8 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -66,12 +66,8 @@ def test_llama2_70b(): dtype_config = get_dtype_config_by_name(dtype_name) parallel_config = ParallelismConfig(tp_size=tp_size) - analysis = LLMAnalysis( - model_config, - gpu_config, - dtype_config, - parallel_config, - ) + analysis = LLMAnalysis(model_config, gpu_config, dtype_config, + parallel_config) summary_dict = analysis.inference( batch_size_per_gpu=batch_size_per_gpu, @@ -79,4 +75,5 @@ def test_llama2_70b(): num_tokens_to_generate=512, ) - assert within_range(summary_dict["total_decode_latency"], 14.79, TOLERANCE) + assert within_range(summary_dict["total_decode_latency"], 180.06, + TOLERANCE)