From b3ec3b4e805ec92397bac8c3153ab6fe1297094e Mon Sep 17 00:00:00 2001 From: F-G Fernandez <26927750+frgfm@users.noreply.github.com> Date: Tue, 19 Mar 2024 19:17:45 +0100 Subject: [PATCH] feat(scripts): clean LLM latency benchmark (#132) * refactor(scripts): update latency script * docs(readme): update script instructions * docs(latency): update latency benchmark * feat(scripts): add docker for latency bench * docs(scripts): add a README * docs(readme): update readme --- README.md | 23 +--- scripts/latency.csv | 4 - scripts/ollama/Dockerfile | 17 +++ scripts/ollama/README.md | 65 +++++++++++ scripts/ollama/docker-compose.yml | 37 ++++++ .../evaluate_latency.py} | 106 +++++++++--------- scripts/ollama/latency.csv | 22 ++++ 7 files changed, 193 insertions(+), 81 deletions(-) delete mode 100644 scripts/latency.csv create mode 100644 scripts/ollama/Dockerfile create mode 100644 scripts/ollama/README.md create mode 100644 scripts/ollama/docker-compose.yml rename scripts/{evaluate_ollama_latency.py => ollama/evaluate_latency.py} (51%) create mode 100644 scripts/ollama/latency.csv diff --git a/README.md b/README.md index ce7ff14..4279a99 100644 --- a/README.md +++ b/README.md @@ -52,33 +52,14 @@ Quack is the AI coding companion that helps software teams ship faster. See it a The backend API is the gatekeeper for your LLM inference container (powered by our friend at [Ollama](https://github.com/ollama/ollama)). With your services up and running, you can use the code chat endpoint as coding-specific LLM chat. +*Check our [LLM latency benchmark](scripts/ollama) on a few cloud providers if you want to run this in the cloud.* + ### REST API for guideline management & LLM inference With the service running, you can navigate to [`http://localhost:8050/docs`](http://localhost:8050/docs) to interact with the API (or do it through HTTP requests) and explore the documentation. ![API Swagger screenshot](https://github.com/quack-ai/contribution-api/assets/26927750/725e8308-ace1-40ed-b742-242f8186fec0) -### Latency benchmark - -You crave for perfect codde suggestions, but you don't know whether it fits your needs in terms of latency? -In the table below, you will find a latency benchmark for all tested LLMs from Ollama: - -| Model | Ingestion mean (std) | Generation mean (std) | -| ------------------------------------------------------------ | ---------------------- | --------------------- | -| [tinyllama:1.1b-chat-v1-q4_0](https://ollama.com/library/tinyllama:1.1b-chat-v1-q4_0) | 2014.63 tok/s (±12.62) | 227.13 tok/s (±2.26) | -| [dolphin-phi:2.7b-v2.6-q4_0](https://ollama.com/library/dolphin-phi:2.7b-v2.6-q4_0) | 684.07 tok/s (±3.85) | 122.25 toks/s (±0.87) | -| [dolphin-mistral:7b-v2.6](https://ollama.com/library/dolphin-mistral:7b-v2.6) | 291.94 tok/s (±0.4) | 60.56 tok/s (±0.15) | - - -This benchmark was performed over 20 iterations on the same input sequence, on a **laptop** to better reflect performances that can be expected by common users. The hardware setup includes an [Intel(R) Core(TM) i7-12700H](https://ark.intel.com/content/www/us/en/ark/products/132228/intel-core-i7-12700h-processor-24m-cache-up-to-4-70-ghz.html) for the CPU, and a [NVIDIA GeForce RTX 3060](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3060-3060ti/) for the laptop GPU. 
-
-You can run this latency benchmark for any Ollama model on your hardware as follows:
-```bash
-python scripts/evaluate_ollama_latency.py dolphin-mistral:7b-v2.6-dpo-laser-q4_0 --endpoint http://localhost:3000
-```
-
-*All script arguments can be checked using `python scripts/evaluate_ollama_latency.py --help`*
-
 ## Get started 🚀
diff --git a/scripts/latency.csv b/scripts/latency.csv
deleted file mode 100644
index 7332694..0000000
--- a/scripts/latency.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-model,hardware,ingestion_mean (tok/s),ingestion_std (tok/s),generation_mean (tok/s),generation_std (tok/s)
-dolphin-mistral:7b-v2.6,NVIDIA RTX 3060 (laptop),291.94,0.4,60.56,0.15
-dolphin-phi:2.7b-v2.6-q4_0,NVIDIA RTX 3060 (laptop),684.07,3.85,122.25,0.87
-tinyllama:1.1b-chat-v1-q4_0,NVIDIA RTX 3060 (laptop),2014.63,12.62,227.13,2.26
diff --git a/scripts/ollama/Dockerfile b/scripts/ollama/Dockerfile
new file mode 100644
index 0000000..da09260
--- /dev/null
+++ b/scripts/ollama/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.11-alpine3.19
+
+WORKDIR /app
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+ENV PYTHONPATH "${PYTHONPATH}:/app"
+
+# install dependencies
+RUN set -eux \
+    && pip install --no-cache-dir uv \
+    && uv pip install --no-cache --system requests==2.31.0 tqdm==4.66.2 numpy==1.26.4 \
+    && rm -rf /root/.cache
+
+# copy script
+COPY ./evaluate_latency.py /app/evaluate.py
diff --git a/scripts/ollama/README.md b/scripts/ollama/README.md
new file mode 100644
index 0000000..f01bd9b
--- /dev/null
+++ b/scripts/ollama/README.md
@@ -0,0 +1,65 @@
+# LLM throughput benchmark
+
+## The benchmark
+
+You crave perfect code suggestions, but you don't know whether they fit your needs in terms of latency?
+
+We ran our tests on the following hardware:
+
+- [NVIDIA GeForce RTX 3060](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3060-3060ti/) (mobile)*
+- [NVIDIA GeForce RTX 3070](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3070-3070ti/) ([Scaleway GPU-3070-S](https://www.scaleway.com/en/pricing/?tags=compute))
+- [NVIDIA A10](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) ([Lambda Cloud gpu_1x_a10](https://lambdalabs.com/service/gpu-cloud#pricing))
+- [NVIDIA A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) ([AWS g5.xlarge](https://aws.amazon.com/ec2/instance-types/g5/))
+
+*The laptop hardware setup includes an [Intel(R) Core(TM) i7-12700H](https://ark.intel.com/content/www/us/en/ark/products/132228/intel-core-i7-12700h-processor-24m-cache-up-to-4-70-ghz.html) for the CPU*
+
+with the following LLMs (cf. [Ollama hub](https://ollama.com/library)):
+- Deepseek Coder 6.7b - instruct ([Ollama](https://ollama.com/library/deepseek-coder), [HuggingFace](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct))
+- OpenCodeInterpreter 6.7b ([Ollama](https://ollama.com/pxlksr/opencodeinterpreter-ds), [HuggingFace](https://huggingface.co/m-a-p/OpenCodeInterpreter-DS-6.7B), [paper](https://arxiv.org/abs/2402.14658))
+- Dolphin Mistral 7b ([Ollama](https://ollama.com/library/dolphin-mistral), [HuggingFace](https://huggingface.co/cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser), [paper](https://arxiv.org/abs/2310.06825))
+- Coming soon: StarChat v2 ([HuggingFace](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1), [paper](https://arxiv.org/abs/2402.19173))
+
+and the following quantization formats: q3_K_M, q4_K_M, q5_K_M.
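+
+Under the hood, the throughput figures are derived from the token counts and durations that Ollama reports for each request (durations are in nanoseconds). Below is a minimal, illustrative sketch of that computation: the request payload approximates what `evaluate_latency.py` sends, while the endpoint and model tag are placeholders you should adapt.
+
+```python
+# Illustrative sketch only: field names follow the Ollama REST API
+# (prompt_eval_* for ingestion, eval_* for generation); durations are nanoseconds.
+import requests
+
+response = requests.post(
+    "http://localhost:11434/api/chat",  # placeholder endpoint
+    json={
+        "model": "deepseek-coder:6.7b-instruct-q4_K_M",  # placeholder model tag
+        "stream": False,
+        "keep_alive": "1s",
+        "messages": [{"role": "user", "content": "Write a Python function to compute the n-th fibonacci number"}],
+    },
+    timeout=60,
+)
+data = response.json()
+ingestion = 1e9 * data["prompt_eval_count"] / data["prompt_eval_duration"]  # prompt tokens per second
+generation = 1e9 * data["eval_count"] / data["eval_duration"]  # generated tokens per second
+print(f"ingestion: {ingestion:.2f} tok/s | generation: {generation:.2f} tok/s")
+```
+
+The full script repeats this over several prompts and iterations, then reports the mean and standard deviation that end up in [latency.csv](latency.csv).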
+
+This [benchmark](latency.csv) was performed over 5 iterations on 4 different sequences, including on a **laptop**, to better reflect the performance that typical users can expect.
+
+## Run it on your hardware
+
+### Local setup
+
+Quite simply, start the services:
+```bash
+docker compose up -d --wait
+```
+Pull the model you want:
+```bash
+docker compose exec -T ollama ollama pull MODEL
+```
+
+And run the evaluation:
+```bash
+docker compose exec -T evaluator python evaluate.py MODEL
+```
+
+### Remote instance
+
+Start the evaluator only:
+```bash
+docker compose up -d evaluator --wait
+```
+And run the evaluation by targeting your remote instance:
+```bash
+docker compose exec -T evaluator python evaluate.py MODEL --endpoint http://HOST:PORT
+```
+
+*All script arguments can be checked using `python scripts/ollama/evaluate_latency.py --help`*
+
+### Others
+
+Here are the results for other LLMs that have only been evaluated on the laptop GPU:
+
+| Model | Ingestion mean (std) | Generation mean (std) |
+| ------------------------------------------------------------ | ---------------------- | --------------------- |
+| [tinyllama:1.1b-chat-v1-q4_0](https://ollama.com/library/tinyllama:1.1b-chat-v1-q4_0) | 2014.63 tok/s (±12.62) | 227.13 tok/s (±2.26) |
+| [dolphin-phi:2.7b-v2.6-q4_0](https://ollama.com/library/dolphin-phi:2.7b-v2.6-q4_0) | 684.07 tok/s (±3.85) | 122.25 tok/s (±0.87) |
+| [dolphin-mistral:7b-v2.6](https://ollama.com/library/dolphin-mistral:7b-v2.6) | 291.94 tok/s (±0.4) | 60.56 tok/s (±0.15) |
diff --git a/scripts/ollama/docker-compose.yml b/scripts/ollama/docker-compose.yml
new file mode 100644
index 0000000..3d220a9
--- /dev/null
+++ b/scripts/ollama/docker-compose.yml
@@ -0,0 +1,37 @@
+version: '3.7'
+
+services:
+  ollama:
+    image: ollama/ollama:0.1.29
+    ports:
+      - "11434:11434"
+    volumes:
+      - "$HOME/.ollama:/root/.ollama"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    command: serve
+    healthcheck:
+      test: ["CMD-SHELL", "ollama --help"]
+      interval: 10s
+      timeout: 5s
+      retries: 3
+
+  evaluator:
+    image: quackai/evaluator:latest
+    build: .
+    depends_on:
+      ollama:
+        condition: service_healthy
+    environment:
+      - OLLAMA_ENDPOINT=http://ollama:11434
+    volumes:
+      - ./evaluate_latency.py:/app/evaluate.py
+    command: sleep infinity
+
+volumes:
+  ollama:
diff --git a/scripts/evaluate_ollama_latency.py b/scripts/ollama/evaluate_latency.py
similarity index 51%
rename from scripts/evaluate_ollama_latency.py
rename to scripts/ollama/evaluate_latency.py
index b4627e2..3a0d18a 100644
--- a/scripts/evaluate_ollama_latency.py
+++ b/scripts/ollama/evaluate_latency.py
@@ -3,13 +3,17 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to for full license details.
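+
+# Benchmark overview: send a fixed set of prompts to an Ollama server and derive
+# ingestion/generation throughput from the token counts and durations it reports.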
-import time +import logging +import os +from datetime import datetime, timezone from typing import Any, Dict import numpy as np import requests from tqdm import tqdm +logger = logging.getLogger(__name__) + def _generate( endpoint: str, model: str, system: str, prompt: str, temperature: float = 0.0, timeout: int = 60 @@ -20,6 +24,7 @@ def _generate( "model": model, "stream": False, "options": {"temperature": temperature}, + "keep_alive": "1s", "system": system, "prompt": prompt, }, @@ -36,6 +41,7 @@ def _chat_completion( "model": model, "stream": False, "options": {"temperature": temperature}, + "keep_alive": "1s", "messages": [{"role": "system", "content": system}, {"role": "user", "content": prompt}], }, timeout=timeout, @@ -43,7 +49,7 @@ def _chat_completion( def _format_response(response, system, prompt) -> Dict[str, Any]: - assert response.status_code == 200 + assert response.status_code == 200, print(response.__dict__) json_response = response.json() return { "duration": { @@ -60,71 +66,57 @@ def _format_response(response, system, prompt) -> Dict[str, Any]: } +SYSTEM_PROMPT = ( + "You are a helpful assistant, you will be given a coding task. Answer correctly, otherwise someone will die." +) +USER_PROMPTS = ( + "Write a Python function to compute the n-th fibonacci number", + "What's the difference between a Promise and an observable in Javascript", + "How are you?", + "Tell me about LLMs", +) + + def main(args): print(args) # Healthcheck on endpoint & model assert requests.get(f"{args.endpoint}/api/tags", timeout=2).status_code == 200 - response = requests.post(f"{args.endpoint}/api/pull", json={"name": args.model, "stream": False}, timeout=10) - assert response.status_code == 200 - assert response.json()["status"] == "success" + assert requests.post(f"{args.endpoint}/api/show", json={"name": args.model}, timeout=2).status_code == 200 - # Speed - speed_system = ( - "You are a helpful assistant, you will be given a coding task. Answer correctly, otherwise someone will die." 
- ) - speed_prompt = "Write a Python function to compute the n-th fibonacci number" # Warmup - for _ in range(args.warmup): - _generate(args.endpoint, args.model, speed_system, speed_prompt) - - # Run - timings = [] - input_chars, output_chars = [], [] - input_tokens, output_tokens = [], [] - load_duration, input_duration, output_duration, total_duration = [], [], [], [] - for _ in tqdm(range(args.it)): - start_ts = time.perf_counter() - response = _generate(args.endpoint, args.model, speed_system, speed_prompt) - timings.append(time.perf_counter() - start_ts) - inference = _format_response(response, speed_system, speed_prompt) - input_chars.append(inference["chars"]["input"]) - output_chars.append(inference["chars"]["output"]) + _chat_completion(args.endpoint, args.model, SYSTEM_PROMPT, "Hello") + # Evaluation run + input_tokens, output_tokens, input_duration, output_duration, load_duration = [], [], [], [], [] + for user_prompt in tqdm(USER_PROMPTS * args.it): + response = _chat_completion(args.endpoint, args.model, SYSTEM_PROMPT, user_prompt) + inference = _format_response(response, SYSTEM_PROMPT, user_prompt) input_tokens.append(inference["tokens"]["input"]) output_tokens.append(inference["tokens"]["output"]) - load_duration.append(inference["duration"]["model"]) input_duration.append(inference["duration"]["input"]) output_duration.append(inference["duration"]["output"]) - total_duration.append(inference["duration"]["total"]) - - print(f"{args.model} ({args.it} runs)") - timings = np.array(timings) - load_duration = np.array(load_duration, dtype=int) + load_duration.append(inference["duration"]["model"]) + # Aggregate information input_duration = np.array(input_duration, dtype=int) output_duration = np.array(output_duration, dtype=int) - total_duration = np.array(total_duration, dtype=int) - print(f"Model load duration: mean {load_duration.mean() / 1e6:.2f}ms, std {load_duration.std() / 1e6:.2f}ms") + load_duration = np.array(load_duration, dtype=int) # Tokens (np.float64 to handle NaNs) input_tokens = np.array(input_tokens, dtype=np.float64) output_tokens = np.array(output_tokens, dtype=np.float64) - input_chars = np.array(input_chars, dtype=np.float64) - output_chars = np.array(output_chars, dtype=np.float64) - print( - f"Input processing: mean {1e9 * input_tokens.sum() / input_duration.sum():.2f} tok/s, std {1e9 * (input_tokens / input_duration).std():.2f} tok/s" - ) - print( - f"Output generation: mean {1e9 * output_tokens.sum() / output_duration.sum():.2f} tok/s, std {1e9 * (output_tokens / output_duration).std():.2f} tok/s" - ) + result = { + "created_at": str(datetime.now(timezone.utc)), + "ingestion_mean": 1e9 * input_tokens.sum() / input_duration.sum(), + "ingestion_std": 1e9 * (input_tokens / input_duration).std(), + "generation_mean": 1e9 * output_tokens.sum() / output_duration.sum(), + "generation_std": 1e9 * (output_tokens / output_duration).std(), + "load_mean": load_duration.mean() / 1e6, + "load_std": load_duration.std() / 1e6, + } - # Chars - print( - f"Input processing: mean {1e9 * input_chars.sum() / input_duration.sum():.2f} char/s, std {1e9 * (input_chars / input_duration).std():.2f} char/s" - ) - print( - f"Output generation: mean {1e9 * output_chars.sum() / output_duration.sum():.2f} char/s, std {1e9 * (output_chars / output_duration).std():.2f} char/s" - ) - print(f"Overall latency (ollama): mean {total_duration.mean() / 1e6:.2f}ms, std {total_duration.std() / 1e6:.2f}ms") - print(f"Overall latency (HTTP): mean {1000 * timings.mean():.2f}ms, std {1000 * 
timings.std():.2f}ms") + print(f"{args.model} ({args.it} runs) at {result['created_at']}") + print(f"Model load duration: mean {result['load_mean']:.2f}ms, std {result['load_std']:.2f}ms") + print(f"Ingestion: mean {result['ingestion_mean']:.2f} tok/s, std {result['ingestion_std']:.2f} tok/s") + print(f"Generation: mean {result['generation_mean']:.2f} tok/s, std {result['generation_std']:.2f} tok/s") def get_parser(): @@ -135,18 +127,20 @@ def get_parser(): ) # Data & model - group = parser.add_argument_group("Data & model") - group.add_argument("model", type=str, help="model to use") - group.add_argument("--endpoint", default="http://localhost:11434/api", type=str, help="Ollama endpoint") + group = parser.add_argument_group("LLM setup") + group.add_argument("model", type=str, help="model to evaluate") + group.add_argument( + "--endpoint", default=os.getenv("OLLAMA_ENDPOINT", "http://ollama:11434"), type=str, help="Ollama endpoint" + ) # Inference params group = parser.add_argument_group("Inference params") group.add_argument("--temperature", default=0, type=float, help="Temperature to use for model inference") - # Inference params + # # Inference params group = parser.add_argument_group("Evaluation") - group.add_argument("--it", type=int, default=20, help="Number of iterations to run") - group.add_argument("--warmup", type=int, default=5, help="Number of iterations for warmup") + group.add_argument("--it", type=int, default=5, help="Number of iterations to run") + # group.add_argument("--warmup", type=int, default=3, help="Number of iterations for warmup") return parser diff --git a/scripts/ollama/latency.csv b/scripts/ollama/latency.csv new file mode 100644 index 0000000..4ffdf00 --- /dev/null +++ b/scripts/ollama/latency.csv @@ -0,0 +1,22 @@ +model,hardware,ingestion_mean (tok/s),ingestion_std (tok/s),generation_mean (tok/s),generation_std (tok/s) +deepseek-coder:6.7b-instruct-q5_K_M,NVIDIA RTX 3060 (laptop),35.43,3.46,23.68,0.74 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA RTX 3060 (laptop),72.27,10.69,36.82,1.25 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA RTX 3060 (laptop),90.1,32.43,50.34,1.28 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA RTX 3060 (laptop),78.94,10.2,37.95,1.65 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA RTX 3060 (laptop),126.75,31.5,50.05,0.84 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA RTX 3060 (laptop),89.47,29.91,47.09,0.67 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),266.98,95.63,75.53,1.56 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),141.43,50.4,73.69,1.61 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),285.81,73.55,75.14,3.13 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),234.2,79.38,71.54,1 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),114.54,38.24,69.29,0.98 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),208.65,74.02,78.68,1.64 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),111.84,39.9,71.66,1.75 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),226.66,65.65,77.26,2.72 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),202.43,69.55,73.9,0.87 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),112.82,38.46,66.98,0.79 +deepseek-coder:6.7b-instruct-q4_K_M,A10G (AWS g5.xlarge),186.81,66.03,79.62,1.52 +deepseek-coder:6.7b-instruct-q3_K_M,A10G (AWS 
g5.xlarge),99.83,35.41,84.47,1.69 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,A10G (AWS g5.xlarge),212.08,86.58,79.02,3.35 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,A10G (AWS g5.xlarge),187.2,62.24,75.91,1 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,A10G (AWS g5.xlarge),102.36,34.29,81.23,1.02