From b3ec3b4e805ec92397bac8c3153ab6fe1297094e Mon Sep 17 00:00:00 2001 From: F-G Fernandez <26927750+frgfm@users.noreply.github.com> Date: Tue, 19 Mar 2024 19:17:45 +0100 Subject: [PATCH] feat(scripts): clean LLM latency benchmark (#132) * refactor(scripts): update latency script * docs(readme): update script instructions * docs(latency): update latency benchmark * feat(scripts): add docker for latency bench * docs(scripts): add a README * docs(readme): update readme --- README.md | 23 +--- scripts/latency.csv | 4 - scripts/ollama/Dockerfile | 17 +++ scripts/ollama/README.md | 65 +++++++++++ scripts/ollama/docker-compose.yml | 37 ++++++ .../evaluate_latency.py} | 106 +++++++++--------- scripts/ollama/latency.csv | 22 ++++ 7 files changed, 193 insertions(+), 81 deletions(-) delete mode 100644 scripts/latency.csv create mode 100644 scripts/ollama/Dockerfile create mode 100644 scripts/ollama/README.md create mode 100644 scripts/ollama/docker-compose.yml rename scripts/{evaluate_ollama_latency.py => ollama/evaluate_latency.py} (51%) create mode 100644 scripts/ollama/latency.csv diff --git a/README.md b/README.md index ce7ff14..4279a99 100644 --- a/README.md +++ b/README.md @@ -52,33 +52,14 @@ Quack is the AI coding companion that helps software teams ship faster. See it a The backend API is the gatekeeper for your LLM inference container (powered by our friend at [Ollama](https://github.com/ollama/ollama)). With your services up and running, you can use the code chat endpoint as coding-specific LLM chat. +*Check our [LLM latency benchmark](scripts/ollama) on a few cloud providers if you want to run this in the cloud.* + ### REST API for guideline management & LLM inference With the service running, you can navigate to [`http://localhost:8050/docs`](http://localhost:8050/docs) to interact with the API (or do it through HTTP requests) and explore the documentation. ![API Swagger screenshot](https://github.com/quack-ai/contribution-api/assets/26927750/725e8308-ace1-40ed-b742-242f8186fec0) -### Latency benchmark - -You crave for perfect codde suggestions, but you don't know whether it fits your needs in terms of latency? -In the table below, you will find a latency benchmark for all tested LLMs from Ollama: - -| Model | Ingestion mean (std) | Generation mean (std) | -| ------------------------------------------------------------ | ---------------------- | --------------------- | -| [tinyllama:1.1b-chat-v1-q4_0](https://ollama.com/library/tinyllama:1.1b-chat-v1-q4_0) | 2014.63 tok/s (±12.62) | 227.13 tok/s (±2.26) | -| [dolphin-phi:2.7b-v2.6-q4_0](https://ollama.com/library/dolphin-phi:2.7b-v2.6-q4_0) | 684.07 tok/s (±3.85) | 122.25 toks/s (±0.87) | -| [dolphin-mistral:7b-v2.6](https://ollama.com/library/dolphin-mistral:7b-v2.6) | 291.94 tok/s (±0.4) | 60.56 tok/s (±0.15) | - - -This benchmark was performed over 20 iterations on the same input sequence, on a **laptop** to better reflect performances that can be expected by common users. The hardware setup includes an [Intel(R) Core(TM) i7-12700H](https://ark.intel.com/content/www/us/en/ark/products/132228/intel-core-i7-12700h-processor-24m-cache-up-to-4-70-ghz.html) for the CPU, and a [NVIDIA GeForce RTX 3060](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3060-3060ti/) for the laptop GPU. 
-
-You can run this latency benchmark for any Ollama model on your hardware as follows:
-```bash
-python scripts/evaluate_ollama_latency.py dolphin-mistral:7b-v2.6-dpo-laser-q4_0 --endpoint http://localhost:3000
-```
-
-*All script arguments can be checked using `python scripts/evaluate_ollama_latency.py --help`*
-
 ## Get started 🚀
diff --git a/scripts/latency.csv b/scripts/latency.csv
deleted file mode 100644
index 7332694..0000000
--- a/scripts/latency.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-model,hardware,ingestion_mean (tok/s),ingestion_std (tok/s),generation_mean (tok/s),generation_std (tok/s)
-dolphin-mistral:7b-v2.6,NVIDIA RTX 3060 (laptop),291.94,0.4,60.56,0.15
-dolphin-phi:2.7b-v2.6-q4_0,NVIDIA RTX 3060 (laptop),684.07,3.85,122.25,0.87
-tinyllama:1.1b-chat-v1-q4_0,NVIDIA RTX 3060 (laptop),2014.63,12.62,227.13,2.26
diff --git a/scripts/ollama/Dockerfile b/scripts/ollama/Dockerfile
new file mode 100644
index 0000000..da09260
--- /dev/null
+++ b/scripts/ollama/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.11-alpine3.19
+
+WORKDIR /app
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+ENV PYTHONPATH "${PYTHONPATH}:/app"
+
+# install dependencies
+RUN set -eux \
+    && pip install --no-cache-dir uv \
+    && uv pip install --no-cache --system requests==2.31.0 tqdm==4.66.2 numpy==1.26.4 \
+    && rm -rf /root/.cache
+
+# copy script
+COPY ./evaluate_latency.py /app/evaluate.py
diff --git a/scripts/ollama/README.md b/scripts/ollama/README.md
new file mode 100644
index 0000000..f01bd9b
--- /dev/null
+++ b/scripts/ollama/README.md
@@ -0,0 +1,65 @@
+# LLM throughput benchmark
+
+## The benchmark
+
+You crave perfect code suggestions, but you don't know whether they fit your needs in terms of latency?
+
+We ran our tests on the following hardware:
+
+- [NVIDIA GeForce RTX 3060](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3060-3060ti/) (mobile)*
+- [NVIDIA GeForce RTX 3070](https://www.nvidia.com/fr-fr/geforce/graphics-cards/30-series/rtx-3070-3070ti/) ([Scaleway GPU-3070-S](https://www.scaleway.com/en/pricing/?tags=compute))
+- [NVIDIA A10](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) ([Lambda Cloud gpu_1x_a10](https://lambdalabs.com/service/gpu-cloud#pricing))
+- [NVIDIA A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) ([AWS g5.xlarge](https://aws.amazon.com/ec2/instance-types/g5/))
+
+*The laptop hardware setup includes an [Intel(R) Core(TM) i7-12700H](https://ark.intel.com/content/www/us/en/ark/products/132228/intel-core-i7-12700h-processor-24m-cache-up-to-4-70-ghz.html) for the CPU*
+
+with the following LLMs (cf. [Ollama hub](https://ollama.com/library)):
+- Deepseek Coder 6.7b - instruct ([Ollama](https://ollama.com/library/deepseek-coder), [HuggingFace](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct))
+- OpenCodeInterpreter 6.7b ([Ollama](https://ollama.com/pxlksr/opencodeinterpreter-ds), [HuggingFace](https://huggingface.co/m-a-p/OpenCodeInterpreter-DS-6.7B), [paper](https://arxiv.org/abs/2402.14658))
+- Dolphin Mistral 7b ([Ollama](https://ollama.com/library/dolphin-mistral), [HuggingFace](https://huggingface.co/cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser), [paper](https://arxiv.org/abs/2310.06825))
+- Coming soon: StarChat v2 ([HuggingFace](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1), [paper](https://arxiv.org/abs/2402.19173))
+
+and the following quantization formats: q3_K_M, q4_K_M, q5_K_M.
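+
+Under the hood, the throughput figures are derived from the token counts and durations that Ollama reports for each request (durations are in nanoseconds). Below is a minimal, illustrative sketch of that computation: the request payload approximates what `evaluate_latency.py` sends, while the endpoint and model tag are placeholders you should adapt.
+
+```python
+# Illustrative sketch only: field names follow the Ollama REST API
+# (prompt_eval_* for ingestion, eval_* for generation); durations are nanoseconds.
+import requests
+
+response = requests.post(
+    "http://localhost:11434/api/chat",  # placeholder endpoint
+    json={
+        "model": "deepseek-coder:6.7b-instruct-q4_K_M",  # placeholder model tag
+        "stream": False,
+        "keep_alive": "1s",
+        "messages": [{"role": "user", "content": "Write a Python function to compute the n-th fibonacci number"}],
+    },
+    timeout=60,
+)
+data = response.json()
+ingestion = 1e9 * data["prompt_eval_count"] / data["prompt_eval_duration"]  # prompt tokens per second
+generation = 1e9 * data["eval_count"] / data["eval_duration"]  # generated tokens per second
+print(f"ingestion: {ingestion:.2f} tok/s | generation: {generation:.2f} tok/s")
+```
+
+The full script repeats this over several prompts and iterations, then reports the mean and standard deviation that end up in [latency.csv](latency.csv).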
+
+This [benchmark](latency.csv) was performed over 5 iterations on 4 different sequences, including on a **laptop**, to better reflect the performance that typical users can expect.
+
+## Run it on your hardware
+
+### Local setup
+
+Quite simply, start the services:
+```bash
+docker compose up -d --wait
+```
+Pull the model you want:
+```bash
+docker compose exec -T ollama ollama pull MODEL
+```
+
+And run the evaluation:
+```bash
+docker compose exec -T evaluator python evaluate.py MODEL
+```
+
+### Remote instance
+
+Start the evaluator only:
+```bash
+docker compose up -d evaluator --wait
+```
+And run the evaluation by targeting your remote instance:
+```bash
+docker compose exec -T evaluator python evaluate.py MODEL --endpoint http://HOST:PORT
+```
+
+*All script arguments can be checked using `python scripts/ollama/evaluate_latency.py --help`*
+
+### Others
+
+Here are the results for other LLMs that have only been evaluated on the laptop GPU:
+
+| Model | Ingestion mean (std) | Generation mean (std) |
+| ------------------------------------------------------------ | ---------------------- | --------------------- |
+| [tinyllama:1.1b-chat-v1-q4_0](https://ollama.com/library/tinyllama:1.1b-chat-v1-q4_0) | 2014.63 tok/s (±12.62) | 227.13 tok/s (±2.26) |
+| [dolphin-phi:2.7b-v2.6-q4_0](https://ollama.com/library/dolphin-phi:2.7b-v2.6-q4_0) | 684.07 tok/s (±3.85) | 122.25 tok/s (±0.87) |
+| [dolphin-mistral:7b-v2.6](https://ollama.com/library/dolphin-mistral:7b-v2.6) | 291.94 tok/s (±0.4) | 60.56 tok/s (±0.15) |
diff --git a/scripts/ollama/docker-compose.yml b/scripts/ollama/docker-compose.yml
new file mode 100644
index 0000000..3d220a9
--- /dev/null
+++ b/scripts/ollama/docker-compose.yml
@@ -0,0 +1,37 @@
+version: '3.7'
+
+services:
+  ollama:
+    image: ollama/ollama:0.1.29
+    ports:
+      - "11434:11434"
+    volumes:
+      - "$HOME/.ollama:/root/.ollama"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    command: serve
+    healthcheck:
+      test: ["CMD-SHELL", "ollama --help"]
+      interval: 10s
+      timeout: 5s
+      retries: 3
+
+  evaluator:
+    image: quackai/evaluator:latest
+    build: .
+    depends_on:
+      ollama:
+        condition: service_healthy
+    environment:
+      - OLLAMA_ENDPOINT=http://ollama:11434
+    volumes:
+      - ./evaluate_latency.py:/app/evaluate.py
+    command: sleep infinity
+
+volumes:
+  ollama:
diff --git a/scripts/evaluate_ollama_latency.py b/scripts/ollama/evaluate_latency.py
similarity index 51%
rename from scripts/evaluate_ollama_latency.py
rename to scripts/ollama/evaluate_latency.py
index b4627e2..3a0d18a 100644
--- a/scripts/evaluate_ollama_latency.py
+++ b/scripts/ollama/evaluate_latency.py
@@ -3,13 +3,17 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to for full license details.
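+
+# Benchmark overview: send a fixed set of prompts to an Ollama server and derive
+# ingestion/generation throughput from the token counts and durations it reports.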
-import time +import logging +import os +from datetime import datetime, timezone from typing import Any, Dict import numpy as np import requests from tqdm import tqdm +logger = logging.getLogger(__name__) + def _generate( endpoint: str, model: str, system: str, prompt: str, temperature: float = 0.0, timeout: int = 60 @@ -20,6 +24,7 @@ def _generate( "model": model, "stream": False, "options": {"temperature": temperature}, + "keep_alive": "1s", "system": system, "prompt": prompt, }, @@ -36,6 +41,7 @@ def _chat_completion( "model": model, "stream": False, "options": {"temperature": temperature}, + "keep_alive": "1s", "messages": [{"role": "system", "content": system}, {"role": "user", "content": prompt}], }, timeout=timeout, @@ -43,7 +49,7 @@ def _chat_completion( def _format_response(response, system, prompt) -> Dict[str, Any]: - assert response.status_code == 200 + assert response.status_code == 200, print(response.__dict__) json_response = response.json() return { "duration": { @@ -60,71 +66,57 @@ def _format_response(response, system, prompt) -> Dict[str, Any]: } +SYSTEM_PROMPT = ( + "You are a helpful assistant, you will be given a coding task. Answer correctly, otherwise someone will die." +) +USER_PROMPTS = ( + "Write a Python function to compute the n-th fibonacci number", + "What's the difference between a Promise and an observable in Javascript", + "How are you?", + "Tell me about LLMs", +) + + def main(args): print(args) # Healthcheck on endpoint & model assert requests.get(f"{args.endpoint}/api/tags", timeout=2).status_code == 200 - response = requests.post(f"{args.endpoint}/api/pull", json={"name": args.model, "stream": False}, timeout=10) - assert response.status_code == 200 - assert response.json()["status"] == "success" + assert requests.post(f"{args.endpoint}/api/show", json={"name": args.model}, timeout=2).status_code == 200 - # Speed - speed_system = ( - "You are a helpful assistant, you will be given a coding task. Answer correctly, otherwise someone will die." 
- ) - speed_prompt = "Write a Python function to compute the n-th fibonacci number" # Warmup - for _ in range(args.warmup): - _generate(args.endpoint, args.model, speed_system, speed_prompt) - - # Run - timings = [] - input_chars, output_chars = [], [] - input_tokens, output_tokens = [], [] - load_duration, input_duration, output_duration, total_duration = [], [], [], [] - for _ in tqdm(range(args.it)): - start_ts = time.perf_counter() - response = _generate(args.endpoint, args.model, speed_system, speed_prompt) - timings.append(time.perf_counter() - start_ts) - inference = _format_response(response, speed_system, speed_prompt) - input_chars.append(inference["chars"]["input"]) - output_chars.append(inference["chars"]["output"]) + _chat_completion(args.endpoint, args.model, SYSTEM_PROMPT, "Hello") + # Evaluation run + input_tokens, output_tokens, input_duration, output_duration, load_duration = [], [], [], [], [] + for user_prompt in tqdm(USER_PROMPTS * args.it): + response = _chat_completion(args.endpoint, args.model, SYSTEM_PROMPT, user_prompt) + inference = _format_response(response, SYSTEM_PROMPT, user_prompt) input_tokens.append(inference["tokens"]["input"]) output_tokens.append(inference["tokens"]["output"]) - load_duration.append(inference["duration"]["model"]) input_duration.append(inference["duration"]["input"]) output_duration.append(inference["duration"]["output"]) - total_duration.append(inference["duration"]["total"]) - - print(f"{args.model} ({args.it} runs)") - timings = np.array(timings) - load_duration = np.array(load_duration, dtype=int) + load_duration.append(inference["duration"]["model"]) + # Aggregate information input_duration = np.array(input_duration, dtype=int) output_duration = np.array(output_duration, dtype=int) - total_duration = np.array(total_duration, dtype=int) - print(f"Model load duration: mean {load_duration.mean() / 1e6:.2f}ms, std {load_duration.std() / 1e6:.2f}ms") + load_duration = np.array(load_duration, dtype=int) # Tokens (np.float64 to handle NaNs) input_tokens = np.array(input_tokens, dtype=np.float64) output_tokens = np.array(output_tokens, dtype=np.float64) - input_chars = np.array(input_chars, dtype=np.float64) - output_chars = np.array(output_chars, dtype=np.float64) - print( - f"Input processing: mean {1e9 * input_tokens.sum() / input_duration.sum():.2f} tok/s, std {1e9 * (input_tokens / input_duration).std():.2f} tok/s" - ) - print( - f"Output generation: mean {1e9 * output_tokens.sum() / output_duration.sum():.2f} tok/s, std {1e9 * (output_tokens / output_duration).std():.2f} tok/s" - ) + result = { + "created_at": str(datetime.now(timezone.utc)), + "ingestion_mean": 1e9 * input_tokens.sum() / input_duration.sum(), + "ingestion_std": 1e9 * (input_tokens / input_duration).std(), + "generation_mean": 1e9 * output_tokens.sum() / output_duration.sum(), + "generation_std": 1e9 * (output_tokens / output_duration).std(), + "load_mean": load_duration.mean() / 1e6, + "load_std": load_duration.std() / 1e6, + } - # Chars - print( - f"Input processing: mean {1e9 * input_chars.sum() / input_duration.sum():.2f} char/s, std {1e9 * (input_chars / input_duration).std():.2f} char/s" - ) - print( - f"Output generation: mean {1e9 * output_chars.sum() / output_duration.sum():.2f} char/s, std {1e9 * (output_chars / output_duration).std():.2f} char/s" - ) - print(f"Overall latency (ollama): mean {total_duration.mean() / 1e6:.2f}ms, std {total_duration.std() / 1e6:.2f}ms") - print(f"Overall latency (HTTP): mean {1000 * timings.mean():.2f}ms, std {1000 * 
timings.std():.2f}ms") + print(f"{args.model} ({args.it} runs) at {result['created_at']}") + print(f"Model load duration: mean {result['load_mean']:.2f}ms, std {result['load_std']:.2f}ms") + print(f"Ingestion: mean {result['ingestion_mean']:.2f} tok/s, std {result['ingestion_std']:.2f} tok/s") + print(f"Generation: mean {result['generation_mean']:.2f} tok/s, std {result['generation_std']:.2f} tok/s") def get_parser(): @@ -135,18 +127,20 @@ def get_parser(): ) # Data & model - group = parser.add_argument_group("Data & model") - group.add_argument("model", type=str, help="model to use") - group.add_argument("--endpoint", default="http://localhost:11434/api", type=str, help="Ollama endpoint") + group = parser.add_argument_group("LLM setup") + group.add_argument("model", type=str, help="model to evaluate") + group.add_argument( + "--endpoint", default=os.getenv("OLLAMA_ENDPOINT", "http://ollama:11434"), type=str, help="Ollama endpoint" + ) # Inference params group = parser.add_argument_group("Inference params") group.add_argument("--temperature", default=0, type=float, help="Temperature to use for model inference") - # Inference params + # # Inference params group = parser.add_argument_group("Evaluation") - group.add_argument("--it", type=int, default=20, help="Number of iterations to run") - group.add_argument("--warmup", type=int, default=5, help="Number of iterations for warmup") + group.add_argument("--it", type=int, default=5, help="Number of iterations to run") + # group.add_argument("--warmup", type=int, default=3, help="Number of iterations for warmup") return parser diff --git a/scripts/ollama/latency.csv b/scripts/ollama/latency.csv new file mode 100644 index 0000000..4ffdf00 --- /dev/null +++ b/scripts/ollama/latency.csv @@ -0,0 +1,22 @@ +model,hardware,ingestion_mean (tok/s),ingestion_std (tok/s),generation_mean (tok/s),generation_std (tok/s) +deepseek-coder:6.7b-instruct-q5_K_M,NVIDIA RTX 3060 (laptop),35.43,3.46,23.68,0.74 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA RTX 3060 (laptop),72.27,10.69,36.82,1.25 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA RTX 3060 (laptop),90.1,32.43,50.34,1.28 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA RTX 3060 (laptop),78.94,10.2,37.95,1.65 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA RTX 3060 (laptop),126.75,31.5,50.05,0.84 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA RTX 3060 (laptop),89.47,29.91,47.09,0.67 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),266.98,95.63,75.53,1.56 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),141.43,50.4,73.69,1.61 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),285.81,73.55,75.14,3.13 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),234.2,79.38,71.54,1 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA RTX 3070 (Scaleway GPU-3070-S),114.54,38.24,69.29,0.98 +deepseek-coder:6.7b-instruct-q4_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),208.65,74.02,78.68,1.64 +deepseek-coder:6.7b-instruct-q3_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),111.84,39.9,71.66,1.75 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),226.66,65.65,77.26,2.72 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),202.43,69.55,73.9,0.87 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,NVIDIA A10 (Lambda Cloud gpu_1x_a10),112.82,38.46,66.98,0.79 +deepseek-coder:6.7b-instruct-q4_K_M,A10G (AWS g5.xlarge),186.81,66.03,79.62,1.52 +deepseek-coder:6.7b-instruct-q3_K_M,A10G (AWS 
g5.xlarge),99.83,35.41,84.47,1.69 +pxlksr/opencodeinterpreter-ds:6.7b-Q4_K_M,A10G (AWS g5.xlarge),212.08,86.58,79.02,3.35 +dolphin-mistral:7b-v2.6-dpo-laser-q4_K_M,A10G (AWS g5.xlarge),187.2,62.24,75.91,1 +dolphin-mistral:7b-v2.6-dpo-laser-q3_K_M,A10G (AWS g5.xlarge),102.36,34.29,81.23,1.02