update readme and test
cli99 committed Jan 3, 2024
1 parent 438cb74 commit f5acbde
Showing 2 changed files with 23 additions and 50 deletions.
62 changes: 19 additions & 43 deletions examples/llama2/README.md
@@ -11,52 +11,28 @@
The Appendix of [Why GPT-3.5 is (mostly) cheaper than Llama 2](https://www.cursor.so/blog/llama-inference) reports measurements of `Llama-2-70B` on 2 A100-80GB GPUs. The tables below show llm-analysis results for the same experiment setups under different efficiency assumptions.
A cost of `$2.21` per GPU-hour is used throughout. For other details, see `run_infer_cursor.py`.
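
To reproduce a row of these tables programmatically, llm-analysis can be driven from Python much as in `tests/test_inference.py` further down in this commit. The sketch below is illustrative only: the import path, the configuration-helper names, the model/GPU/dtype identifiers, and the `flops_efficiency`/`hbm_memory_efficiency` keyword arguments are assumptions inferred from this repository, and `run_infer_cursor.py` remains the authoritative script.

```python
# Illustrative sketch only -- import path, helper names, identifiers, and
# keyword arguments are assumptions inferred from this repo; see
# run_infer_cursor.py for the authoritative invocation.
from llm_analysis.analysis import (LLMAnalysis, ParallelismConfig,
                                   get_dtype_config_by_name,
                                   get_gpu_config_by_name,
                                   get_model_config_by_name)

model_config = get_model_config_by_name("meta-llama/Llama-2-70b-hf")  # assumed name
gpu_config = get_gpu_config_by_name("a100-sxm-80gb")                  # assumed name
dtype_config = get_dtype_config_by_name("w16a16e16")                  # assumed name
parallel_config = ParallelismConfig(tp_size=2)  # 2 A100-80GB GPUs, tensor parallel

analysis = LLMAnalysis(
    model_config,
    gpu_config,
    dtype_config,
    parallel_config,
    flops_efficiency=0.6,        # first setup below
    hbm_memory_efficiency=0.6,
)

summary = analysis.inference(
    batch_size_per_gpu=1,
    seq_len=512,                 # prompt tokens
    num_tokens_to_generate=512,  # completion tokens
)
print(summary["total_decode_latency"])  # seconds spent generating the completion
```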

- `flops_efficiency=0.6 and hbm_memory_efficiency=0.6`

| Batch Size | Prompt Tokens | Completion Tokens | Time to first token (s) | Time for completion (s) | Tokens/second | Price/1k prompt tokens | Price/1k completion tokens |
| ---------- | ------------- | ----------------- | ------------------------ | ----------------------- | ------------- | ---------------------- | --------------------------- |
| 1 | 128 | 242 | 0.055 | 13.185 | 18.277 | 0.000532 | 0.06689 |
| 2 | 128 | 512 | 0.080 | 28.025 | 36.434 | 0.000386 | 0.03360 |
| 4 | 128 | 512 | 0.158 | 28.209 | 72.197 | 0.000378 | 0.01691 |
| 1 | 512 | 512 | 0.159 | 27.986 | 18.192 | 0.000381 | 0.06711 |
| 2 | 512 | 512 | 0.318 | 28.130 | 35.995 | 0.000381 | 0.03373 |
| 4 | 512 | 512 | 0.635 | 28.420 | 70.487 | 0.000381 | 0.01704 |
| 1 | 512 | 304 | 0.159 | 16.600 | 18.140 | 0.000381 | 0.06704 |
| 8 | 512 | 512 | 1.270 | 28.999 | 135.323 | 0.000381 | 0.00869 |
| 16 | 512 | 512 | 2.539 | 30.156 | 250.556 | 0.000381 | 0.00452 |
| 32 | 512 | 512 | 5.078 | 32.472 | 436.335 | 0.000380 | 0.00243 |
| 1 | 1024 | 512 | 0.321 | 28.056 | 18.043 | 0.000385 | 0.06728 |
| 2 | 1024 | 512 | 0.642 | 28.271 | 35.416 | 0.000385 | 0.03390 |
| 4 | 1024 | 512 | 1.284 | 28.701 | 68.301 | 0.000385 | 0.01721 |
| 8 | 1024 | 512 | 2.568 | 29.560 | 127.488 | 0.000385 | 0.00886 |
| 16 | 1024 | 512 | 5.136 | 31.280 | 224.957 | 0.000385 | 0.00469 |
| 1 | 3595 | 512 | 1.192 | 28.408 | 17.297 | 0.000407 | 0.06812 |
| 2 | 3595 | 512 | 2.384 | 28.976 | 32.653 | 0.000407 | 0.03474 |
| 4 | 3595 | 512 | 4.767 | 30.111 | 58.719 | 0.000407 | 0.01805 |

- `flops_efficiency=0.7 and hbm_memory_efficiency=0.9`

| Batch Size | Prompt Tokens | Completion Tokens | Time to first token (s) | Time for completion (s) | Tokens/second | Price/1k prompt tokens | Price/1k completion tokens |
| ---------- | ------------- | ----------------- | ------------------------ | ----------------------- | ------------- | ---------------------- | --------------------------- |
| 1 | 128 | 242 | 0.039 | 8.894 | 27.090 | 0.000374 | 0.04512 |
| 2 | 128 | 512 | 0.068 | 18.903 | 53.976 | 0.000326 | 0.02267 |
| 4 | 128 | 512 | 0.136 | 19.026 | 106.878 | 0.000325 | 0.01141 |
| 1 | 512 | 512 | 0.137 | 18.877 | 26.928 | 0.000328 | 0.04527 |
| 2 | 512 | 512 | 0.273 | 18.974 | 53.204 | 0.000327 | 0.02275 |
| 4 | 512 | 512 | 0.546 | 19.167 | 103.892 | 0.000327 | 0.01149 |
| 1 | 512 | 304 | 0.137 | 11.197 | 26.823 | 0.000328 | 0.04522 |
| 8 | 512 | 512 | 1.092 | 19.553 | 198.402 | 0.000327 | 0.00586 |
| 16 | 512 | 512 | 2.183 | 20.326 | 363.938 | 0.000327 | 0.00305 |
| 32 | 512 | 512 | 4.366 | 21.872 | 624.440 | 0.000327 | 0.00164 |
| 1 | 1024 | 512 | 0.276 | 18.924 | 26.666 | 0.000331 | 0.04538 |
| 2 | 1024 | 512 | 0.552 | 19.067 | 52.193 | 0.000331 | 0.02286 |
| 4 | 1024 | 512 | 1.104 | 19.354 | 100.106 | 0.000331 | 0.01160 |
| 8 | 1024 | 512 | 2.208 | 19.928 | 185.037 | 0.000331 | 0.00597 |
| 16 | 1024 | 512 | 4.416 | 21.075 | 321.363 | 0.000331 | 0.00316 |
| 1 | 3595 | 512 | 1.025 | 19.159 | 25.367 | 0.000350 | 0.04594 |
| 2 | 3595 | 512 | 2.049 | 19.537 | 47.437 | 0.000350 | 0.02343 |
| 4 | 3595 | 512 | 4.098 | 20.294 | 83.961 | 0.000350 | 0.01217 |

| Batch Size | Prompt Tokens | Completion Tokens | Time to first token (s) | Time for completion (s) | Tokens/second | Price/1k prompt tokens | Price/1k completion tokens |
| ---------- | ------------- | ----------------- | ----------------------- | ----------------------- | ------------- | ---------------------- | --------------------------- |
| 1 | 128 | 242 | 0.040 | 9.425 | 25.568 | 0.000383 | 0.0478 |
| 2 | 128 | 512 | 0.062 | 20.026 | 50.977 | 0.000296 | 0.0240 |
| 4 | 128 | 512 | 0.123 | 20.148 | 101.031 | 0.000295 | 0.0121 |
| 1 | 512 | 512 | 0.124 | 20.000 | 25.442 | 0.000298 | 0.0480 |
| 2 | 512 | 512 | 0.248 | 20.096 | 50.334 | 0.000297 | 0.0241 |
| 4 | 512 | 512 | 0.496 | 20.288 | 98.537 | 0.000297 | 0.0122 |
| 1 | 512 | 304 | 0.124 | 11.864 | 25.359 | 0.000298 | 0.0479 |
| 8 | 512 | 512 | 0.991 | 20.673 | 189.069 | 0.000297 | 0.0062 |
| 16 | 512 | 512 | 1.982 | 21.442 | 349.726 | 0.000297 | 0.0032 |
| 32 | 512 | 512 | 3.963 | 22.981 | 608.077 | 0.000297 | 0.0017 |
| 1 | 1024 | 512 | 0.251 | 20.047 | 25.224 | 0.000301 | 0.0481 |
| 2 | 1024 | 512 | 0.502 | 20.190 | 49.488 | 0.000301 | 0.0242 |
| 4 | 1024 | 512 | 1.004 | 20.476 | 95.348 | 0.000301 | 0.0123 |
| 8 | 1024 | 512 | 2.007 | 21.048 | 177.666 | 0.000301 | 0.0063 |
| 16 | 1024 | 512 | 4.014 | 22.191 | 312.615 | 0.000301 | 0.0033 |
| 1 | 3595 | 512 | 0.936 | 20.282 | 24.130 | 0.000320 | 0.0486 |
| 2 | 3595 | 512 | 1.872 | 20.660 | 45.446 | 0.000320 | 0.0248 |
| 4 | 3595 | 512 | 3.745 | 21.416 | 81.398 | 0.000320 | 0.0128 |
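
The price columns appear to follow directly from the latency columns at the stated rate of `$2.21` per GPU-hour on 2 GPUs; the quick check below, against the first row of the first table, assumes that is how they were derived (small differences come from rounding of the displayed latencies).

```python
# Check the price columns against the latency columns for the first row of
# the first table: batch size 1, 128 prompt tokens, 242 completion tokens,
# 2 GPUs at $2.21 per GPU-hour. Assumes this is how the prices were derived.
NUM_GPUS = 2
COST_PER_GPU_HOUR = 2.21

def price_per_1k_tokens(latency_s: float, num_tokens: int) -> float:
    dollars = latency_s / 3600 * NUM_GPUS * COST_PER_GPU_HOUR
    return dollars / num_tokens * 1000

print(price_per_1k_tokens(0.055, 128))   # ~0.00053 (table: 0.000532)
print(price_per_1k_tokens(13.185, 242))  # ~0.06689 (table: 0.06689)
```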

## References

11 changes: 4 additions & 7 deletions tests/test_inference.py
@@ -66,17 +66,14 @@ def test_llama2_70b():
     dtype_config = get_dtype_config_by_name(dtype_name)
     parallel_config = ParallelismConfig(tp_size=tp_size)
 
-    analysis = LLMAnalysis(
-        model_config,
-        gpu_config,
-        dtype_config,
-        parallel_config,
-    )
+    analysis = LLMAnalysis(model_config, gpu_config, dtype_config,
+                           parallel_config)
 
     summary_dict = analysis.inference(
         batch_size_per_gpu=batch_size_per_gpu,
         seq_len=512,
         num_tokens_to_generate=512,
     )
 
-    assert within_range(summary_dict["total_decode_latency"], 14.79, TOLERANCE)
+    assert within_range(summary_dict["total_decode_latency"], 180.06,
+                        TOLERANCE)
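
The updated expectation can be checked locally with something like `pytest tests/test_inference.py::test_llama2_70b`, assuming pytest is the project's test runner.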
